Data Scientists Toolbox

This class is rather elementary, therefore there won’t be a whole lot of explanation here. The first week provides an overview of all of the courses in the Data Science specialization. The second week provides an introduction to using the command line, github, markdown, and installing packages in R. The third week gives an overview of what data science is, what data is, and briefly discusses experimental design.


Getting and cleaning data

I found this class to be extremely useful. It deals with the ins and outs of obtaining and cleaning data from many sources. The techniques discussed here are must-have’s for analyzing data in R.

Cleaning data

  • Data cleaning refers to any type of manipulation you have to do to the data in its raw form in order to prepare it for analysis
  • This often requires reshaping, merging, and subsetting data, but also can involve dealing with dates, regular expressions, and obtaining data via the internet.

Reshaping data

# melt data (specify id's and measure variables)
mtcars$carname <- rownames(mtcars)
carMelt <- melt(mtcars, id = c("carname", "gear", "cyl"), measure.vars = c("mpg", "hp"))

# cast data using dcast() (left of tilde for rows, right of tilde for columns)
cylData <- dcast(carMelt, cyl ~ variable, mean)

# sum values by group (split/apply/combine)
library(plyr)
# fixed typo: 'countCOl' -> 'countCol' (consistent camelCase column name)
ddply(df, .(factor), summarize, sum = sum(countCol))

Merging data

# merge data frames
merge(df1, df2, by.x = "df1Col", by.y = "df2Col", all = TRUE)

# merge using plyr by common id
library(plyr)
arrange(join(df1, df2), id)

# merge multiple dataframes using list
dfList <- list(df1, df2, df3)
join_all(dfList)

Loading and acquiring data

# load large data (gets class of each column before loading full dataset to save memory)
initial <- read.table("datafile.txt", nrows = 100)
classes <- sapply(initial, class)
df <- read.table("datafile.txt", colClasses = classes)
    
# check for subdirectory within working directory 
# and create it if it does not exist
if (!file.exists("data")) {
    dir.create("data")
}

xlsx

# read .xlsx file 
library(xlsx)
df <- read.xlsx("path/to/file.xlsx", sheetIndex = 1, header = TRUE)

Download file from the internet

# download file from internet
fileUrl <- ""
download.file(fileUrl, destfile = "./data/filename.csv", method = "curl")
list.files("./data")

Get date

# Get date
dateDownloaded <- date()
dateDownloaded

XML

# download xml file from internet
library(XML)
    
fileUrl <- ""
doc <- xmlTreeParse(fileUrl, useInternal = TRUE)
rootNode <- xmlRoot(doc)
xmlName(rootNode)
names(rootNode)

JSON

# download JSON data from internet
library(jsonlite)
jsonData <- fromJSON("url")
names(jsonData)

MySQL

# MySQL: connect to the public UCSC genome server and list databases
library(RMySQL)
ucscDb <- dbConnect(MySQL(), user = "genome",
                    host = "genome-mysql.cse.ucsc.edu")
# fixed: 'ucscDB' (capital B) was undefined; the connection object is 'ucscDb'
result <- dbGetQuery(ucscDb, "show databases;"); dbDisconnect(ucscDb);
# ex: connect directly to the hg19 database and inspect its tables
hg19 <- dbConnect(MySQL(), user = "genome", db = "hg19",
                  host = "genome-mysql.cse.ucsc.edu")  # fixed: ',' -> '.' in hostname
allTables <- dbListTables(hg19)
length(allTables)

allTables[1:5]
dbListFields(hg19, "affyU133Plus2")
# fixed: stray 'a' before the table name made the SQL invalid
dbGetQuery(hg19, "select count(*) from affyU133Plus2")

# read the whole table into a data frame
affyData <- dbReadTable(hg19, "affyU133Plus2")
head(affyData)  # fixed: was head(affyU133Plus2), which is not a defined object

# select a specific subset
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- fetch(query); quantile(affyMis$misMatches)
    
affyMisSmall <- fetch(query, n=10); dbClearResult(query);
dim(affyMisSmall)

dbDisconnect(hg19)

hdf5

# hdf5 data: install the rhdf5 package from Bioconductor
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")

library(rhdf5)
# fixed: the result was assigned to 'create' but inspected as 'created'
created <- h5createFile("example.h5")
created

# create groups (fixed typo: 'exmaple.h5' -> 'example.h5')
created <- h5createGroup("example.h5", "foo")
created <- h5createGroup("example.h5", "baa")
created <- h5createGroup("example.h5", "foo/foobaa")

# look at it
h5ls("example.h5")

# write a matrix and an array to the groups
A <- matrix(1:10, nr = 5, nc = 2)
h5write(A, "example.h5", "foo/A")
# fixed: seq(0, 1, 2.0, by = 0.1) is not a valid call; the course example is
# seq(0.1, 2.0, by = 0.1), which yields the 20 values required by dim = c(5, 2, 2)
B <- array(seq(0.1, 2.0, by = 0.1), dim = c(5, 2, 2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5", "foo/foobaa/B")
h5ls("example.h5")

# write a dataset
df = data.frame(1L:5L, seq(0,1, length.out=5),
    c("ab", "cde", "fghi", "a", "s"), stringsAsFactors=FALSE)
h5write(df, "example.h5", "df")
h5ls("example.h5")

# reading data
readA = h5read("example.h5", "foo/A")
readB = h5read("example.h5", "foo/foobaa/B")
readdf= h5read("example.h5", "df")
readA

# writing and reading chunks
h5write(c(12,13,14), "example.h5", "foo/A", index=list(1:3,1))
h5read("example.h5", "foo/A")

Webscraping

# web scraping
con = url("http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en")
htmlCode = readLines(con)
close(con)
htmlCode
  • this gives you one really long line of html, must parse data in one of 2 ways
# 1
library(XML)
url <- "http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en"
html <- htmlTreeParse(url, useInternalNodes=T)

xpathSApply(html, "//title", xmlValue)

xpathSApply(html, "//td[@id='col-citedby']", xmlValue)

# 2 
library(httr); html2 = GET(url)
content2 = content(html2, as="text")
parsedHtml = htmlParse(content2, asText=TRUE)
xpathSApply(parsedHtml, "//title", xmlValue)

# if webpage requires pw, you must do the following
pg2 = GET("httpbin.org/basic-auth/user/passwd", 
    authenticate("user", "passwd"))
pg2

APIs

# twitter
library(httr)
myapp = oauth_app("twitter",
                    key = "yourCOnsumerKeyHere", secret = "yourConsumerSecretHere")
sig = sign_oauth1.0(myapp,
                    token = "yourTokenHere",
                    token_secret = "yourTokenSecretHere")
homeTL = GET("https://api.twitter.com/1.1/statuses/home_timeline.json", sig)

# converting the json object
json1 = content(homeTL)
json2 = jsonlite::fromJSON(toJSON(json1))
json2[1,1:4]

Github

# Github
library(httr)
    
# 1. Find OAuth settings for github:
#    http://developer.github.com/v3/oauth/
oauth_endpoints("github")

# 2. Register an application at https://github.com/settings/applications
#    Insert your values below - if secret is omitted, it will look it up in
#    the GITHUB_CONSUMER_SECRET environmental variable.
#
#    Use http://localhost:1410 as the callback url
myapp <- oauth_app("github", "2bc549002113db38fcde")

# 3. Get OAuth credentials
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)

# 4. Use API
req <- GET("https://api.github.com/rate_limit", config(token = github_token))
stop_for_status(req)
content(req)

Editing text variables

# change case
tolower()
toupper()
    
# split strings
strsplit(x, "\\.")
    
# strip out first element 
# Return the first element of a vector (used below with sapply() to keep
# only the leading token of each split string).
firstElement <- function(x) x[1]
sapply(vector, firstElement)

# substitute characters from vector
sub("oldchar", "newchar", vector)

# substitute characters from vector
gsub("oldchar", "newchar", vector)

Finding values

# find value
grep("searchstring", df$col)

# find value, show token
grep("searchstring", df$col, value = TRUE)

# find out if value occurs
length(grep("searchstring", df$col))

# find value - return logical vector in table
table(grepl("searchstring", df$col))

# subset using grepl
newdf <- olddf[!grepl("searchstring", olddf$col),]

String functions

# find # of chars
library(stringr)
nchar("Joseph Casillas")
## [1] 15
# select specific chars (1st-6th char)
substr("Joseph Casillas", 1, 6)
## [1] "Joseph"
# paste elements together
paste("Joseph", "Casillas")
## [1] "Joseph Casillas"
# paste elements together with no space
paste0("Joseph", "Casillas")
## [1] "JosephCasillas"
# remove trailing white space
str_trim("Joseph     ")
## [1] "Joseph"

Regular expressions

  • Used for searching for text
  • Combination of literals (words) and metacharacters (grammar)
  • Metacharacters used to specify more general chars
    • whitespace
    • word boundaries
    • sets of literals
    • beginning and end of a line
    • alternatives (‘this’ or ‘that’)

Metacharacters

  • ^ matches at the start of a line
    • ex. ^i think
  • $ matches the end of a line
    • ex. morning$
  • [] (character class) matches either/or inside of the brackets
    • ex. [Bb][Uu][Ss][Hh] matches any iteration of bush/Bush/bUsH/etc.
  • You can specify a range inside of []
    • ex. ^[0-9][a-zA-Z] matches any number followed by any letter at the beginning of a line
  • ^ inside of a character class ([]) will not match the characters in the class
    • ex. [^?.]$ will match any line that doesn’t end with a ? or a .
  • . matches any character
    • ex. 9.11 will match a 9, followed by anything, followed by 11
  • | combines two expressions. It translates to “or”
    • ex. flood|fire will match “flood” or “fire”
    • ex. ^[Gg]ood|[Bb]ad will match “good” (upper or lower case) at the beginning of a line or “bad” anywhere
    • ex. ^([Gg]ood|[Bb]ad) same as above, but both alternatives are restricted to beginning of the line
  • ? directly after () indicates that the enclosed expression is optional
    • ex. [Gg]eorge( [Ww]\.)? [Bb]ush matches “George Bush” and “George W. Bush” (lower or uppercase)
  • * matches whatever is directly before it any number of times (including none). * is greedy (it looks for the max), but can be made ‘less greedy’ by following it with a ? (i.e. ^s(.*?)s$)
    • ex. (.*) will match anything between parentheses… “(hello how are you)” and “()”
  • + matches whatever is directly before it one or more times (not including none)
    • ex. [0-9]+ (.*)[0-9]+ will match at least one number followed by any number of characters followed by at least one number
  • {} are interval quantifiers that specify the minimum and maximum number of matches of an expression. This can be {x,y} (atleast x, but not more than y), {x,} (atleast x matches), or {x} (exactly x matches)
    • ex. [Bb]ush( +[^ ]+ +){1,5} debate will match “Bush” and “debate” with between one to five words in between them
  • () are scope limiters. Whatever is matched between them can be repeated using escaped numbers, i.e. \1
    • +([a-zA-Z]+) +\1 + will match repeated words followed and preceded by a space (i.e. “night night”, “so so”, etc.)

Working with dates

  • dates can be formatted
    • %d = day as number (0-31)
    • %a = abbreviated weekday
    • %A = unabbreviated weekday
    • %m = month (00-12)
    • %b = abbreviated month
    • %B = unabbreviated month
    • %y = 2 digit year
    • %Y = 4 digit year
    • ex. d2 <- Sys.Date() returns “2014-01-12”. If we run format(d2, "%a %b %d") it outputs as “Sun Jan 12”
  • You can create dates from a character vector with as.Date(x, "")
    • ex. x <- c("1jan1960", "2jan1960"); z <- as.Date(x, "%d%b%Y")
    • This would return [1] "1960-01-01" "1960-01-02"
  • You can substract/add dates: z[2] - z[1] returns Time difference of 1 days. This can also be returned as a numeric vector: as.numeric(z[2] - z[1])
  • You can convert to Julian
    • weekdays()
    • months()
    • julian()
  • library(lubridate) makes working with dates easier
    • ymd()
    • mdy()
    • dmy()
    • ymd_hms()
    • wday(x[]), wday(x[], label = TRUE)
  • posixLT
  • posixCT

str() function

  • Answers questions ‘what’s in this object?’
  • Handy for finding the arguments of a function
  • Really you can call it on just about anything

Data resources

Non-governmental sites

APIs

  • twitter and twitteR package
  • figshare and rfigshare
  • PLos and rplos
  • rOpenSci
  • Facebook and RFacebook
  • Google maps and RGoogleMaps

Other useful packages

Searching for relevant package

  • “data storage mechanism” + R package
  • ex. mySQL R package

interacting with files

  • file()
  • url()
  • gzfile()
  • bzfile()
  • ?connections

  • foreign package
  • read.XXX

reading images (packages and functions)

  • jpeg
  • readbitmap
  • png
  • EBImage

GIS data

  • rgdal
  • rgeos
  • raster

reading music data

  • tuneR
  • seewave

R Programming

Setup

Snippets and functions that are useful for initially setting up the R workspace.

# cleanup global environment
rm(list = ls(all = TRUE))

Utilities

General purpose gems that everybody should know

# make table of dataset
table(df$col, useNA="ifany")

# xtabs
xt <- xtabs(Freq ~ var1 + var2, data = df)

# check for NAs (other useful commands... any(), all(),)
sum(is.na(df$col))

# row and column sums
colSums(is.na(df))
rowSums(is.na(df))
    
# size of dataset
print(object.size(df), units = "Mb")

# creating sequences
seq(1, 10, by = 2)
seq(1, 10, length = 3)
seq(along = x)

# generate levels of a factor
gl(2, 8, labels = c("Control", "Treat"))

Subsetting

Subsetting in many different ways.

# find out if there are values w/ specific characteristics
table(df$col %in% c("value1", "value2"))

# subset dataframe based on special characteristics specified above
df[df$col %in% c("value1", "value2"),]

# subset using % % (returns T/F)
df$newcol <- df$someCol %in% c("level1", "level2")

# create binary variables (if condition is true, add true in new col)
df$newcol <- ifelse(df$someCol < 0, TRUE, FALSE)

# 'cut' - create categorical (factor) variable from continuous variable
df$newCol <- cut(df$someCol, breaks = quantile(df$someCol))

# 'cut' with plyr
library(plyr)
newDF <- mutate(oldDF, newCol = cut2(oldCol, g=4))

Control structures

General programming basics. Useful for all languages.

  • if, else
  • for
  • while
  • repeat
  • break
  • next
  • return

Mainly for writing programs. When using CLI, better to use *apply.

if, else: testing a condition

# ex 1
if(<condition>) {
    ## do something
} else {
    ## do something else
}

# ex 2
if(<condition>) {
    ## do something
} else if(<condition>) {
    ## do something different
} else {
    ## do something different
}

for: execute a loop a fixed number of times

x <- c("a", "b", "c", "d")

# Ex 1
for(i in 1:4) {
    print(x[i])
}
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
# Ex 2
for(i in seq_along(x)) {
    print(x[i])
}
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
# Ex 3
for(letter in x) {
    print(letter)
}
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
# Ex 4
for(i in 1:4) print(x[i])
## [1] "a"
## [1] "b"
## [1] "c"
## [1] "d"
  • ‘for’ loops can be nested
x <- matrix(1:6, 2, 3)

for(i in seq_len(nrow(x))) {
    for(j in seq_len(ncol(x))) {
        print(x[i, j])
    }
}
## [1] 1
## [1] 3
## [1] 5
## [1] 2
## [1] 4
## [1] 6

while: execute a loop while a condition is true

# Ex 1
count <- 0
while(count <= 10) {
    print(count)
    count <- count +1
}
## [1] 0
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
# Ex 2
z <- 5
while(z >= 3 && z <= 10) {
    print(z)
    coin <- rbinom(1, 1, 0.5)

    if(coin == 1) { ## random walk
        z <- z + 1
    } else {
        z <- z - 1
    }
}
## [1] 5
## [1] 6
## [1] 7
## [1] 6
## [1] 5
## [1] 6
## [1] 5
## [1] 4
## [1] 5
## [1] 6
## [1] 5
## [1] 4
## [1] 5
## [1] 6
## [1] 5
## [1] 4
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 5
## [1] 6
## [1] 5
## [1] 6
## [1] 5
## [1] 4
## [1] 3

repeat: execute an infinite loop (this one is dangerous)

x0 <- 1
tol <- 1e-8
    
repeat {
    x1 <- computeEstimate()

    if(abs(x1 - x0) < tol) {
        break
    } else {
        x0 <- x1
    }
}

break: break the execution of a loop

next: skip an iteration of a loop

for(i in 1:100) {
    if(i <= 20) {
        ## skip the first 20 iterations
        next
    }
    ## do something here
}

return: exit a function

  • signals that a function should exit and return a value

FUNctions

  • This is the next step for really getting good at R and having a better understanding of how it works.
  • Functions can be passed as arguments to other functions
  • Functions can be nested inside other functions
f <- function(<arguments>) {
    ## do something interesting
}
  • functions have named arguments
  • formal arguments are included in function def. (ie above)
  • formals function returns a list of all the formal arguments of a function
  • not every call will make use of all the formal arguments
  • arguments can be missing or have default values
  • argument matching (exact > partial > positional)

Defining a function

  • You can define the default values of an argument, leave them blank, or set them to NULL. Functions use lazy evaluation. This means that if an argument is stipulated in the function, but not included in the function, it will still work correctly.
f <- function(a, b) {
    a^2
}

f(2)
## [1] 4

This will return 4. 2 is positionally matched to a and b is never evaluated.

f <- function(a, b) {
    print(a)
    print(b)
}

f(45)
  • This example will return 45 and then give an error. This is because there is a value positionally matched to a, but not to b. So when R evaluates b it gives an error because of the missing argument after it has already evaluated a.

the “…” argument

  • The ... argument is good for cases when you are extending the functionality of an already established function.
# Extend plot() with a line-type default; all other arguments are
# forwarded unchanged via '...'.
# fixed: type = "1" (digit one) is not a valid plot type; the course
# example uses the letter "l" (line)
myplot <- function(x, y, type = "l", ...) {
    plot(x, y, type = type, ...)
}
  • This will extend plot() to default to type = "l" (a line plot) and use all of the other default arguments of the original function. Generic functions also use ..., but this hasn’t been explained yet. It is also used in functions in which the number of necessary arguments cannot be known in advance.
args(paste)
function(..., sep = " ", collapse = NULL)
  • All arguments after ... have to be named explicitly. If not, R will ignore partial matching or give an error.

My first function

# Add two numbers together and return the sum.
add2 <- function(x, y) {
    total <- x + y
    total
}
  • This function will add together any two numbers.
# Keep only the elements of x that are greater than 10.
above10 <- function(x) {
    x[x > 10]
}
  • This function will return a logical vector that subsets elements of x that are greater than 10.
# Keep only the elements of x greater than the threshold n (default 10).
above <- function(x, n = 10) {
    keep <- x > n
    x[keep]
}
  • This function is the same as the previous one; however it has two arguments. Specifically you can choose the threshold at which the subset starts (in this case n). It will default to 10 if nothing is specified.
    x <- 1:20
    above(x, 12)
## [1] 13 14 15 16 17 18 19 20
  • …will return all the numbers between 1 and 20 that are greater than 12. Now for something more complex…
# Compute the mean of each column of a matrix or data frame.
# removeNA is passed to mean() as na.rm, so NAs are dropped by default.
columnmean <- function(x, removeNA = TRUE) {
    nc <- ncol(x) # get the number of cols of df/matrix
    means <- numeric(nc) # initialize vector that will store means of each column
    # seq_len(nc) is safe when nc == 0; the original 1:nc would yield c(1, 0)
    # and index out of bounds on a zero-column input
    for(i in seq_len(nc)) {
        means[i] <- mean(x[, i], na.rm = removeNA)
    }
    means
}
  • This function returns the column means of a matrix or dataframe. It involves taking the argument and looping through each column, calculating the mean of each one.

Loop functions

  • Loop a function over a vector, matrix, df, etc. The ‘apply’ family.
    • apply()
    • lapply()
    • sapply()
    • tapply()
    • mapply()
    • split()
    • anonymous functions

Examples

lapply()

  • loop over a list of objects and apply a function over every element of the list.
  • takes three arguements
    1. a list
    2. a function
    3. other arguments via ...
  • if object (x) is not a list, it is coerced to a list
  • Always returns a list.

Ex.

x <- list(a = 1:5, b = rnorm(10))
lapply(x, mean)
## $a
## [1] 3
## 
## $b
## [1] -0.1031074

sapply()

  • like lapply, but tries to simplify to a vector
  • Same arguements as lapply()
  • does not return a list, but rather a vector (or a matrix)
  • if sapply() can’t figure it out, it returns a list

Ex.

x <- list(a = 1:4, b = rnorm(10), c = rnorm(20, 1), d = rnorm(100, 5))
lapply(x, mean) # returns list
## $a
## [1] 2.5
## 
## $b
## [1] -0.07106553
## 
## $c
## [1] 1.163607
## 
## $d
## [1] 5.012901
sapply(x, mean) # returns vector
##           a           b           c           d 
##  2.50000000 -0.07106553  1.16360668  5.01290117

anonymous functions

  • a function that does not have a name

Ex.

lapply(x, fucntion(elt) elt[, 1])

apply()

  • evaluates a function over the margins of an array
  • often used to apply a function to the rows or columns of a matrix
  • can be used with general arrays
  • not faster than loop, but is a 1-liner
  • x is an array
  • MARGIN is an integer vector indicating which margins should be retained
    • 1 = rows
    • 2 = columns
  • FUN is a function to be applied
  • ... is for other arguements to be passed to FUN
  • Normally returns a vector
    • If you run apply() on a 3d array, you can preserve 2 dimensions and return a matrix

Ex.

x <- matrix(rnorm(200), 20, 10) 
apply(x, 1, mean)
##  [1]  0.42417922  0.04910433  0.27391192 -0.51664794  0.42171934
##  [6] -0.25901877 -0.61139349 -0.18639265 -0.08993693  0.61883191
## [11]  0.24303317  0.18559808  0.66725016  0.44584964  0.12905192
## [16] -0.01529047  0.13415607 -0.07157860  0.44163923 -0.11329998
apply(x, 2, mean)
##  [1] -0.10231580  0.24086328  0.30112640 -0.12028531  0.11798669
##  [6]  0.03198587  0.07097659  0.27964122 -0.02763181  0.29303596
  • There are special functions that do the same thing as apply()
    • rowSums()
    • rowMeans()
    • colSums()
    • colMeans()

tapply()

  • applies a function over subsets of a vector
  • returns a table
  • x is a vector
  • INDEX is a factor or a list of factors
  • FUN is a function to be applied
  • ... contains other arguements to be passed to FUN
  • simplify is a logical to simplify result or not (if FALSE, you get a list)

Ex.

vot <- rnorm(20, 2, 5)
group <- gl(2, 10, 20, labels = c("en", "sp"))
treatment <- gl(4, 1, 20, labels = c("beg", "int", "med", "adv"))
df <- as.data.frame(cbind(group, treatment, vot))

tapply(df$vot, df$group, mean)
##        1        2 
## 1.671528 1.948244
tapply(df$vot, list(df$group, df$treatment), mean)
##           1        2         3        4
## 1 -0.625504 5.123076 -3.843420 5.454701
## 2  1.633227 4.245328  0.137493 2.437617

split()

  • splits objects into groups/subpieces
  • x is a vector (or list) or df
  • f is a factor (or coerced to one) or a list of factors
  • drop indicates whether empty factors levels should be dropped
  • often used with lapply() i.e. lapply(split(x, f), mean), but this is pretty much the same as tapply()
  • is handy for splitting a df

mapply()

  • multivariate version of apply
  • FUN is a function to apply
  • ... contains arguments to apply over
  • MoreArgs is a list of other arguments to FUN
  • SIMPLIFY logical that indicate if the result should be simplified or not

Ex.

  • Instead of typing out
list(rep(1, 4), rep(2, 3), rep(3, 2), rep(4, 1))
## [[1]]
## [1] 1 1 1 1
## 
## [[2]]
## [1] 2 2 2
## 
## [[3]]
## [1] 3 3
## 
## [[4]]
## [1] 4
  • We can use…
mapply(rep, 1:4, 4:1)
## [[1]]
## [1] 1 1 1 1
## 
## [[2]]
## [1] 2 2 2
## 
## [[3]]
## [1] 3 3
## 
## [[4]]
## [1] 4

Debugging tools

Conditions

There are three types of condition, a generic concept for indicating that something unexpected can occur, this can be created by the programmer

  • message is a generic notification, very tame, function is still executed
  • warning indicates that something went wrong, but not a fail, function is still executed
  • error indicates that a fatal problem occurred, execution stops

Questions to think about

  • what was your input? How did you call the function?
  • What were you expecting? Output, messages, other results?
  • What did you get?
  • How does what you get differ from what you were expecting?
  • Were your expectations correct in the first place?
  • Can you reproduce the problem (exactly)?

Tools for debugging

  1. traceback prints out the function call stack after an error occurs; does nothing if there’s no error
  2. debug flags a function for ‘debug’ mode which allows you to step through execution of a function one line at a time
  3. browser suspends the execution of a function wherever it is called puts the function in debug mode
  4. trace allows you to insert debugging code into a function at specific places
  5. recover allows you to modify the error behavior so that you can browse the function call stack

Coding standards

  • important so that code is readable
  • allows people to understand what you did/why you did it
  1. use text editor, save as text file
  2. indent code
  3. limit width
  4. limit length of function

Simulation

basics

  • d = density
  • r = random
  • p = cumulative
  • q = quantile

  • rnorm(): random normal variates with given mean and std
  • dnorm(): evaluate the normal probability density
  • pnorm(): evaluate the cumulative distribution function for a normal dist.
  • qnorm(): evaluate the quantile function (inverse CDF) for a normal dist.

Ex. normal dist.

# fixed argument names to match the real signatures:
# pnorm() takes quantiles 'q'; qnorm() takes probabilities 'p'
dnorm(x, mean = 0, sd = 1, log = FALSE)
pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
rnorm(n, mean = 0, sd = 1)

Generating random numbers from a linear model

linear regression

set.seed(20)
x <- rnorm(100)
e <- rnorm(100, 0, 2)
y <- 0.5 + 2 * x + e
summary(y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -6.4080 -1.5400  0.6789  0.6893  2.9300  6.5050
plot(x, y)

logistic regression

set.seed(10)
x <- rbinom(100, 1, 0.5)
e <- rnorm(100, 0, 2)
y <- 0.5 + 2 * x + e
summary(y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.4940 -0.1409  1.5770  1.4320  2.8400  6.9410
plot(x, y)

Count data

set.seed(1)
x <- rnorm(100)
log.mu <- 0.5 + 0.3 * x
y <- rpois(100, exp(log.mu))
summary(y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    1.00    1.55    2.00    6.00
plot(x, y)

Sample function

  • Draw a random from an object
  • Ex. sample(1:10, 4)
  • Can set replace = TRUE so that numbers from sample can be repeated

R profiler

  • Used for debugging/assessing code
  • Helps to optimize code, find out if/why it is slow

When and why?

  • Typically after you have thought about and written your code

system.time()

  • evaluates r expression
  • user time: time the CPU needed to evaluate
  • elapsed time: real, measured clock/wall time

Rprof()

  • this function starts the profiler in R
  • summaryRprof() will summarize the output of Rprof()
  • Don’t use at the same time as system.time()

What does it do?

  • It keeps track of the function call stack at regularly sampled intervals
  • It then tabulates how much time is spent on each function
  • “This called this, which called that, which called…”

summaryRprof()

  • “by.total” = divides the time spent on each function by the total run time
    • time spent in top level function, generally less interesting
  • “by.self” = the same but first subtracts out time spent on functions above in the call stack
    • how much time spent in lower level functions, gives more accurate idea of what you should target to optimize

Optimization

  • optim, nlm, optimize
  • require you to pass a function whose argument is a vector of parameters
  • you can write a constructor function

Ex.

set.seed(1); normals <- rnorm(100, 1, 2)
nLL <- make.NegLogLik(normals)
nLL

# Constructor function: returns the negative log-likelihood of a normal model
# as a function of the free parameters, closing over 'data'.
# 'fixed' lets the caller pin mu and/or sigma to a value (FALSE = free).
make.NegLogLik <- function(data, fixed = c(FALSE, FALSE)) {
    params <- fixed
    # Returned closure takes only the free parameters 'p', so it can be
    # passed directly to optim()/optimize() (see the calls above).
    function(p) {
        params[!fixed] <- p  # fill free slots; fixed slots keep their values
        mu <- params[1]
        sigma <- params[2]
        # normal log-likelihood: normalizing-constant term ...
        a <- -0.5 * length(data) * log(2 * pi * sigma^2)
        # ... plus the sum-of-squares term
        b <- -0.5 * sum((data-mu)^2) / (sigma^2)
        -(a + b)  # negate so that minimizing gives the MLE
    }
}
  • Estimating parameters
optim(c(mu = 0, sigma = 1), nLL)$par
  • Fixing o = 2
nll <- make.NegLogLik(normals, c(FALSE, 2))
optimize(nLL, c(-1, 3))$minimum
  • Fixing u = 1
nll <- make.NegLogLik(normals, c(1, FALSE))
optimize(nLL, c(1e-6, 10))$minimum

Exploratory data analysis

Six principles of analytic graphics

  1. Show comparisons
    • Evidence for a hypothesis is always relative to another competing hypothesis
    • always ask “compared to what”?
  2. Show causality, mechanism, explanation
    • What is your causal framework for thinking about a question?
  3. Show multivariate data
    • Multivariate = more than 2 variables
    • the real world is multivariate
    • need to “escape flatland”
  4. Integrate multiple modes of evidence
    • Completely integrate words, numbers, images, diagrams
    • Data graphics should make use of many modes of data presentation
    • Don’t let the tool drive the analysis
  5. Describe and document the evidence
    • A data graphic should tell a complete story that is credible
  6. Content is king
    • Analytical presentations ultimately stand or fall depending on the quality, relevance, and integrity of their content

Exploratory graphs

  • Graphs you make for yourself to get an idea of whats going on (understand data properties, find patterns, suggest modeling strategies, debug analysis)
  • Goal = personal understanding

Simple summaries of data (1 dimension)

  • Five number summary
  • Boxplots
boxplot(pollution$pm25, col = "blue")  # fixed typo: 'mp25' -> 'pm25'
abline(h = 12)
  • Histograms
hist(pollution$pm25, col = "green", breaks = 100)  
rug(pollution$pm25)  # fixed: missing closing parenthesis
abline(v = 12, lwd = 2)
abline(v = median(pollution$pm25), col = "magenta", lwd = 4)
  • Density Plot
  • Barplot
barplot(table(pollution$region), col = "wheat", main = "Number of counties in each region")
  • 2 dimensions
    • Multiple/overlayed 1d plots (lattice/ggplot2)
    • Scatterplots
with(pollution, plot(latitude, pm25))
abline(h = 12, lwd = 2, lty = 2)

par(mfrow = c(1, 2), mar = c(5, 4, 2, 1))
with(subset(pollution, region == "west"), plot(latitude, pm25, main = "West"))
with(subset(pollution, region == "east"), plot(latitude, pm25, main = "East"))
  • Smooth scatterplots
  • > than 2d
  • Overlayed/multiple 2d plots, coplots
  • Use color, size, shape to add dimensions
  • Spinning plots
  • Actual 3d plot (not that useful)

Plotting systems

The base plotting system

  • Artist palette model
  • Use annotation functions to add/modify (text, lines, points, axis)
  • Mirrors thought process
  • Can't go back, must plan in advance or start over
  • Series of R commands

Ex.

library(datasets)  
data(cars)  
with(cars, plot(speed, dist))

  • Generally graphing using base is a two step process…
    1. Initialize a new plot
    2. Annotate the existing plot
  • There are many parameters (should try to memorize ?par)

Ex.

library(datasets)
hist(airquality$Ozone)

with(airquality, plot(Wind, Ozone))

airquality <- transform(airquality, Month = factor(Month))
boxplot(Ozone ~ Month, airquality, xlab = "Month", ylab = "Ozone (ppb)")

Key parameters

  • pch: plotting symbol
  • lty: line type
  • lwd: line width
  • col: plotting color
  • xlab: x label (char string)
  • ylab: y label (char string)
  • par(): used to set global graphics parameters
    • las: orientation of the axis labels on the plot
    • bg: background color
    • mar: margin size
    • oma: outer margin size
    • mfrow: number of plots per row, column (filled row wise)
    • mfcol: number of plots per row, column (filled column wise)

Key plotting functions

  • plot: scatterplot, or type of plot that makes sense for the class of the object being plotted
  • lines: add lines to a plot, 2-column matrix, function connects the dots
  • points: add points to a plot
  • text: add text labels to a plot using specified x, y coordinates
  • title: add annotations to x, y axis labels, title, subtitle, outer margin
  • mtext: add arbitrary text to the margins (inner/outer) of the plot
  • axis: add axis ticks/labels

Ex. Base plot with annotation

library(datasets)
with(airquality, plot(Wind, Ozone))
title(main = "Ozone and Wind in New York City")

Ex. Base plot with subset and coloring

library(datasets)
with(airquality, plot(Wind, Ozone, main = "Ozone and Wind in New York City"))
with(subset(airquality, Month == 5), points(Wind, Ozone, col = "blue"))

Ex. Base plot with more subsetting and coloring

library(datasets)
with(airquality, plot(Wind, Ozone, main = "Ozone and Wind in New York City", type = "n"))
with(subset(airquality, Month == 5), points(Wind, Ozone, col = "blue"))
with(subset(airquality, Month != 5), points(Wind, Ozone, col = "red"))
legend("topright", pch = 1, col = c("blue", "red"), legend = c("May", "Other months"))

Ex. Base plot with regression line

library(datasets)
with(airquality, plot(Wind, Ozone, main = "Ozone and Wind in New York City", pch = 20))
model <- lm(Ozone ~ Wind, data = airquality)  # simple linear regression
abline(model, lwd = 2)  # overlay the fitted line

Ex. Multiple base plots

library(datasets)
model1 <- lm(Ozone ~ Wind, data = airquality)
model2 <- lm(Ozone ~ Solar.R, data = airquality)
par(mfrow = c(2, 1))  # stack the two plots vertically
with(airquality, {
    plot(Wind, Ozone, main = "Ozone and Wind")
    abline(model1, lwd = 2)
    plot(Solar.R, Ozone, main = "Ozone and Solar Radiation")
    abline(model2, lwd = 2)
})

library(datasets)
model1 <- lm(Ozone ~ Wind, data = airquality)
model2 <- lm(Ozone ~ Solar.R, data = airquality)
model3 <- lm(Ozone ~ Temp, data = airquality)

# 1 row x 3 columns; oma reserves outer-margin space for a shared title
par(mfrow = c(1, 3), mar = c(4, 4, 2, 1), oma = c(0, 0, 2, 0))
with(airquality, {
    plot(Wind, Ozone, main = "Ozone and Wind")
    abline(model1, lwd = 2)
    plot(Solar.R, Ozone, main = "Ozone and Solar Radiation")
    abline(model2, lwd = 2)
    plot(Temp, Ozone, main = "Ozone and Temperature")
    abline(model3, lwd = 2)
    mtext("Ozone and Weather in NYC", outer = TRUE)  # title in the outer margin
})

Ex. Testing it out on my own…

x <- rnorm(100)
y <- x + rnorm(100)/3  # y linearly related to x plus noise
g <- gl(2, 50, labels = c("Male", "Female"))  # two-level grouping factor
fit <- lm(y ~ x)

plot(x, y, type = "n")  # set up axes without drawing points
points(x[g == "Male"], y[g == "Male"], col = "blue")
points(x[g == "Female"], y[g == "Female"], col = "red")
abline(fit, lwd = 2, col = "green")  # overall regression line
legend("topleft", legend = c("Male", "Female"), pch = 1, col = c("blue", "red"))
title("Plot title")

Graphics devices

  • something you can use to make a plot appear
    • a window
    • a pdf file
    • a png or jpg file
    • a scalable vector graphics (SVG) file
  • Plots have to be sent to a graphics device (most common is a screen device)
  • Vector formats
    • pdf: useful for line-type graphics, resizes well, usually portable, not efficient if a plot has many objects/points
    • svg: xml-based scalable vector graphics, supports animation and interactivity, potentially useful for web-based plots
    • win.metafile: only for windows
    • postscript: older format, also resizes well, predecessor to pdf
  • Bitmap formats
    • png: bitmapped format good for drawing lines and images with solid colors, good for plots with many points
    • jpeg: does not resize well
    • tiff: supports lossless compression
    • bmp: windows
  • Copying plots
    • make a plot with screen device, then save it
    • dev.copy: copy a plot from one device to another
    • dev.copy2pdf: specifically copy a plot to a pdf file

Ex. code

library(datasets)
with(faithful, plot(eruptions, waiting)) # create plot
dev.copy(png, file = "testplot.png") # copy plot in screen device to a png file
dev.off() # close png device (this writes the file to disk)

The Lattice system

  • Plots constructed with function call (xyplot, bwplot)
  • Useful for conditioning plots, panel plots
  • Details set automatically because plot is specified at once
  • Cannot add to plot once it is generated
  • Main functions
    • xyplot(): main function for scatterplots
    • bwplot(): box and whisker plot, boxplot
    • histogram(): histograms
    • stripplot(): like bwplot, but with points
    • dotplot(): dots on violin strings
    • splom(): scatterplot matrix
    • levelplot(), contourplot(): for image data
  • Normally use a formula for first argument
  • Lattice returns an object of class trellis and this object is autoprinted
    • You can verify this by giving a trellis object a variable (i.e. p <-) and then printing said object (i.e. print(p))
    • This differs from when you do not assign the object to a variable (it is autoprinted)
# general template: one panel per level combination of conditioning factors f and g
xyplot(y ~ x | f * g, data = )

Ex. Basic scatterplot

library(lattice)
library(datasets)
xyplot(Ozone ~ Wind, data = airquality)  # formula interface: y ~ x

Ex. Scatterplot at distinct levels

library(datasets)
library(lattice)
airquality <- transform(airquality, Month = factor(Month))
# condition on Month: one panel per month, laid out 5 across, 1 down
xyplot(Ozone ~ Wind | Month, data = airquality, layout = c(5, 1))

Ex.

library(lattice)  
state <- data.frame(state.x77, region = state.region)  
# life expectancy vs. income, one panel per US region
xyplot(Life.Exp ~ Income | region, data = state, layout = c(4,1))

  • Panel functions
    • these control what happens inside each panel of the plot
    • there are default settings, they can be customized
    • they receive the x/y coordinates of the data points in their panel
set.seed(10)
x <- rnorm(100)
f <- rep(0:1, each = 50)  # group indicator
y <- x + f - f * x + rnorm(100, sd = 0.5)  # relationship differs by group
f <- factor(f, labels = c("Group 1", "Group 2"))
xyplot(y ~ x | f, layout = c(2, 1))  # one panel per group

Ex. w/ custom panel function (median line)

set.seed(10)
x <- rnorm(100)
f <- rep(0:1, each = 50)
y <- x + f - f * x + rnorm(100, sd = 0.5)
f <- factor(f, labels = c("Group 1", "Group 2"))
# the panel function receives only the x/y values belonging to its own panel
xyplot(y ~ x | f, panel = function(x, y, ...) {
    panel.xyplot(x, y, ...) # call default panel function to plot points
    panel.abline(h = median(y), lty = 2) # add horizontal line at median
})

Ex. w/ custom panel function (regression line)

set.seed(10)
x <- rnorm(100)
f <- rep(0:1, each = 50)
y <- x + f - f * x + rnorm(100, sd = 0.5)
f <- factor(f, labels = c("Group 1", "Group 2"))
xyplot(y ~ x | f, panel = function(x, y, ...) {
    panel.xyplot(x, y, ...) # call default panel function to plot points
    panel.lmline(x, y, col = 2) # add simple lm line (fit per panel)
})

ggplot2

  • Splits difference between base and lattice
  • Aesthetics calculated at once
  • Superficially similar to lattice but more intuitive
  • Grammar of Graphics
    • mapping from data to aesthetic attributes (colour, shape, size) of geometric objects (points, lines, bars)
  • In ggplot2 plots need:
    • a dataframe
    • aesthetics: colour, shape, size, etc.
    • geoms: points lines, bars, etc.
    • facets: for conditional plots (basically subsetting in to multiple panels)
    • stats: statistical transformations (binning, quantiles, smoothing, regression)
    • scales: define how the variables are coded (ex. male = red, female = blue)
    • coordinate system: how numerical representations are translated onto the plot
  • qplot() is the workhorse, similar to plot() in base r
  • ggplot() is core function, can do things qplot() cannot

qplot()

Ex. 1 - Hello world

library(ggplot2)  
## Loading required package: methods
data(mpg)  # fuel-economy dataset bundled with ggplot2
str(mpg)
## 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: Factor w/ 15 levels "audi","chevrolet",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ model       : Factor w/ 38 levels "4runner 4wd",..: 2 2 2 2 2 2 2 3 3 3 ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : Factor w/ 10 levels "auto(av)","auto(l3)",..: 4 9 10 1 4 9 1 9 4 10 ...
##  $ drv         : Factor w/ 3 levels "4","f","r": 2 2 2 2 2 2 2 1 1 1 ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : Factor w/ 5 levels "c","d","e","p",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ class       : Factor w/ 7 levels "2seater","compact",..: 2 2 2 2 2 2 2 2 2 2 ...
# 'hello world' of ggplot
qplot(displ, hwy, data = mpg)

qplot(displ, hwy, data = mpg, color = drv)  # color points by drive type

qplot(displ, hwy, data = mpg, geom = c("point", "smooth"))  # points + smoother
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

qplot(displ, hwy, data = mpg, color = drv) + geom_smooth(method = "lm")

Ex. 2 - Histograms

# make histogram by only specifying one variable (bars filled by drive type)
qplot(hwy, data = mpg, fill = drv)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Ex. 3 - Facets

  • syntax is factor ~ factor
  • Left of the tilde is rows
  • Right of the tilde is columns
  • A . indicates that no variable is specified and usually equates to 1 row/column
qplot(displ, hwy, data = mpg, facets = .~drv)  # one row of panels, one column per drv

qplot(hwy, data = mpg, facets = drv ~., binwidth = 2)  # one panel row per drv

ggplot()

  • artists palette model
  • plots are built up in layers
    • plot datasets
    • overlay a summary
    • metadata and annotation

Colors in plots

  • the grDevices package is good for changing colors
  • two important Functions
    1. colorRamp()
    2. colorRampPalette()

Ex. Return RGB

# colorRamp() returns a function mapping [0, 1] to RGB values on 0-255
pal <- colorRamp(c("red", "blue"))
pal(0)  # 0 -> pure red
##      [,1] [,2] [,3]
## [1,]  255    0    0
pal(1)  # 1 -> pure blue
##      [,1] [,2] [,3]
## [1,]    0    0  255
pal(0.5)  # halfway blend of red and blue
##       [,1] [,2]  [,3]
## [1,] 127.5    0 127.5
pal(seq(0, 1, len = 10))  # 10 evenly spaced colors along the ramp
##            [,1] [,2]      [,3]
##  [1,] 255.00000    0   0.00000
##  [2,] 226.66667    0  28.33333
##  [3,] 198.33333    0  56.66667
##  [4,] 170.00000    0  85.00000
##  [5,] 141.66667    0 113.33333
##  [6,] 113.33333    0 141.66667
##  [7,]  85.00000    0 170.00000
##  [8,]  56.66667    0 198.33333
##  [9,]  28.33333    0 226.66667
## [10,]   0.00000    0 255.00000

Ex. Return hexadecimals (like HTML)

# colorRampPalette() returns a function taking n and giving n interpolated hex colors
pal <- colorRampPalette(c("red", "yellow"))
pal(2)  # just the two endpoints
## [1] "#FF0000" "#FFFF00"
pal(10)  # 10 colors interpolated between red and yellow
##  [1] "#FF0000" "#FF1C00" "#FF3800" "#FF5500" "#FF7100" "#FF8D00" "#FFAA00"
##  [8] "#FFC600" "#FFE200" "#FFFF00"

Ex. Creating color combinations

library(RColorBrewer)
## Warning: package 'RColorBrewer' was built under R version 3.1.2
cols <- brewer.pal(3, "BuGn")  # 3 colors from the blue-green sequential palette
pal <- colorRampPalette(cols)  # interpolate them into a full ramp
image(volcano, col = pal(20))  # use 20 ramp colors for the image

Ex. Smooth scatter function

# smoothScatter() shows a 2-D density rather than 10,000 overplotted points
x <- rnorm(10000)
y <- rnorm(10000)
smoothScatter(x, y)

Hierarchical clustering

  • Cluster data together to produce a tree that shows how close things are together
  • Must define close
  • Must decide how to group things together
  • Must decide how to visualize and interpret the groupings
  • Must use a defined distance
  • Must set a merging approach
  • This is done by measuring some type of distance
    • euclidean (continuous)
    • correlation (continuous)
    • manhattan (binary)

Ex.

set.seed(1234)
par(mar = c(0, 0, 0, 0))
# 12 points in 3 groups of 4, centered near (1,1), (2,2), (3,1)
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))  # label each point

Ex. Create a distance matrix

dataFrame <- data.frame(x = x, y = y)
# defaults to euclidean distance; prints the lower triangle
# of the 12 x 12 pairwise distance matrix
dist(dataFrame)
##             1          2          3          4          5          6
## 2  0.34120511
## 3  0.57493739 0.24102750
## 4  0.26381786 0.52578819 0.71861759
## 5  1.69424700 1.35818182 1.11952883 1.80666768
## 6  1.65812902 1.31960442 1.08338841 1.78081321 0.08150268
## 7  1.49823399 1.16620981 0.92568723 1.60131659 0.21110433 0.21666557
## 8  1.99149025 1.69093111 1.45648906 2.02849490 0.61704200 0.69791931
## 9  2.13629539 1.83167669 1.67835968 2.35675598 1.18349654 1.11500116
## 10 2.06419586 1.76999236 1.63109790 2.29239480 1.23847877 1.16550201
## 11 2.14702468 1.85183204 1.71074417 2.37461984 1.28153948 1.21077373
## 12 2.05664233 1.74662555 1.58658782 2.27232243 1.07700974 1.00777231
##             7          8          9         10         11
## 2
## 3
## 4
## 5
## 6
## 7
## 8  0.65062566
## 9  1.28582631 1.76460709
## 10 1.32063059 1.83517785 0.14090406
## 11 1.37369662 1.86999431 0.11624471 0.08317570
## 12 1.17740375 1.66223814 0.10848966 0.19128645 0.20802789

Ex. Dendrogram

distxy <- dist(dataFrame)  # pairwise euclidean distances
hClustering <- hclust(distxy)  # agglomerative hierarchical clustering
plot(hClustering)  # the plot method for hclust draws the dendrogram

K-means clustering

  • Useful for summarizing data
  • Must define close
  • Must determine how to group things together
  • Must determine how to visualize grouping
  • Must interpret grouping
  • Difference from Hierarchical cluster… a partitioning approach
    • fix a number of clusters
    • get centroids
    • assign things to closest centroid
    • recalculate centroids
  • This requires…
    • a defined distance metric
    • a number of clusters
    • an initial guess with regard to cluster centroids
  • This produces…
    • Final estimate of cluster centroids
    • an assignment of each point to clusters

Ex. K-means clustering

set.seed(1234)
par(mar = c(0, 0, 0, 0))
# same simulated 3-group data as the hierarchical clustering example
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:12))

dataFrame <- data.frame(x, y)
kmeansObj <- kmeans(dataFrame, centers = 3)  # partition into 3 clusters

# plot kmeans()
par(mar = rep(0.2, 4))
plot(x, y, col = kmeansObj$cluster, pch = 19, cex = 2)  # color points by cluster
points(kmeansObj$centers, col = 1:3, pch = 3, cex = 3, lwd = 3)  # mark centroids

Ex. with heatmap

set.seed(1234)
# shuffle the rows, then re-fit k-means to the shuffled matrix
dataMatrix <- as.matrix(dataFrame)[sample(1:12), ]
kmeansObj2 <- kmeans(dataMatrix, centers = 3)
par(mfrow = c(1, 2), mar = c(2, 4, 0.1, 0.1))
image(t(dataMatrix)[, nrow(dataMatrix):1], yaxt = "n")  # rows in shuffled order
# reorder columns by the clustering fit to THIS matrix (kmeansObj2).
# The original used kmeansObj from the previous example, whose cluster
# assignments correspond to the unshuffled rows, not these ones.
image(t(dataMatrix)[, order(kmeansObj2$cluster)], yaxt = "n")

Principal components analysis and singular value decomposition

SVD

  • if X is a matrix with each variable in a column and each observation in a row, then the SVD is a “matrix decomposition”
    X = UDV^T
  • …where the columns of U are orthogonal (left singular vectors), the columns of V are orthogonal (right singular vectors) and D is a diagonal matrix (singular values).

PCA

  • The principal components are equal to the right singular vectors if you first scale (subtract the mean, divide by the SD) the variables.

Ex. SVD

set.seed(12345)
par(mar = rep(0.2, 4))
dataMatrix <- matrix(rnorm(400), nrow = 40)  # 40 x 10 pure-noise matrix
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])  # draw rows top-to-bottom

par(mar = rep(0.2, 4))
heatmap(dataMatrix)  # no real structure yet, so the dendrograms are uninformative

# add a pattern
set.seed(678910)
for (i in 1:40) {
    # flip a coin
    coinflip <- rbinom(1, size = 1, prob = 0.5)
    # if coin is heads add a common pattern to that row
    if (coinflip) {
        dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 3), each = 5)
    }
}

par(mar = rep(0.2, 4))
image(1:10, 1:40, t(dataMatrix)[, nrow(dataMatrix):1])

heatmap(dataMatrix)  # columns now separate into low/high halves

# order rows by the cluster dendrogram, then inspect row and column means
hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(rowMeans(dataMatrixOrdered), 40:1, xlab = "Row means", ylab = "Row", pch = 19)
plot(colMeans(dataMatrixOrdered), xlab = "Column", ylab = "Column mean", pch = 19)

# SVD: the first singular vectors pick up the dominant pattern
svd1 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd1$u[, 1], 40:1, xlab = "Row", ylab = "First left singular vector", pch = 19)
plot(svd1$v[, 1], xlab = "Column", ylab = "First right singular vector", pch = 19)

Ex. Components of the SVD - variance explained

par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singular value", pch = 19)
# proportion of total variance captured by each component: d_i^2 / sum(d^2)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Prop. of variance explained", pch = 19)

Ex. Relationship to principal components

svd1 <- svd(scale(dataMatrixOrdered))
pca1 <- prcomp(dataMatrixOrdered, scale = TRUE)
# first PC loading vs. first right singular vector of the scaled data
plot(pca1$rotation[, 1], svd1$v[, 1], pch = 19, xlab = "Principal Component 1", ylab = "Right singular vector 1")
abline(c(0, 1))  # points fall on y = x: the two are identical

Ex. More variance explained

# every row is the same 0/1 block pattern, so one singular value
# explains 100% of the variation
constantMatrix <- dataMatrixOrdered*0
for(i in 1:dim(dataMatrixOrdered)[1]){constantMatrix[i, ] <- rep(c(0, 1), each = 5)}
svd1 <- svd(constantMatrix)
par(mfrow = c(1, 3))
image(t(constantMatrix)[, nrow(constantMatrix):1])
plot(svd1$d, xlab = "Column", ylab = "Singular value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Prop. of variance explained", pch = 19)

Ex. Add a second pattern

set.seed(678910)
for (i in 1:40) {
    # flip a coin
    coinflip1 <- rbinom(1, size = 1, prob = 0.5)
    coinflip2 <- rbinom(1, size = 1, prob = 0.5)
    # if coin is heads add a common pattern to that row
    if (coinflip1) {
        # block pattern: first five columns low, last five high
        dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), each = 5)
    }
    if (coinflip2) {
        # alternating pattern: every other column high
        dataMatrix[i, ] <- dataMatrix[i, ] + rep(c(0, 5), 5)
    }
}
hh <- hclust(dist(dataMatrix))
dataMatrixOrdered <- dataMatrix[hh$order, ]

Ex. True patterns

svd2 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
# the two generating patterns, plotted for reference
plot(rep(c(0, 1), each = 5), pch = 19, xlab = "Column", ylab = "Pattern 1")
plot(rep(c(0, 1), 5), pch = 19, xlab = "Column", ylab = "Pattern 2")

# the first two right singular vectors pick up (mixtures of) those patterns
svd2 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 3))
image(t(dataMatrixOrdered)[, nrow(dataMatrixOrdered):1])
plot(svd2$v[, 1], pch = 19, xlab = "Column", ylab = "First right singular vector")
plot(svd2$v[, 2], pch = 19, xlab = "Column", ylab = "second right singular vector")

svd1 <- svd(scale(dataMatrixOrdered))
par(mfrow = c(1, 2))
plot(svd1$d, xlab = "Column", ylab = "Singular value", pch = 19)
plot(svd1$d^2/sum(svd1$d^2), xlab = "Column", ylab = "Prop. of variance explained", pch = 19)

Missing values

  • Cannot SVD with missing data
  • Solution: impute package
dataMatrix2 <- dataMatrixOrdered
## Randomly insert some missing data
dataMatrix2[sample(1:100, size = 40, replace = FALSE)] <- NA

library(impute) # not installed
dataMatrix2 <- impute.knn(dataMatrix2)$data  # fill NAs via k-nearest-neighbor imputation
svd1 <- svd(scale(dataMatrixOrdered)); svd2 <- svd(scale(dataMatrix2))
# compare first right singular vectors: complete vs. imputed data
par(mfrow = c(1, 2)); plot(svd1$v[, 1], pch = 19); plot(svd2$v[, 1], pch = 19)

Reproducible research

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.


Statistical inference

Probability

  • Given a random experiment, a probability measure is a population quantity that summarizes the randomness
  • Probability takes a possible outcome from the experiment in advance
    • assigns it a number between 0 and 1
    • probability that something occurs is 1 and the probability of the union of any two sets of outcomes that have nothing in common is the sum of their respective probabilities
  • Rules
    • Probability that nothing occurs is 0
    • Probability that something occurs is 1
    • Probability of something is 1 minus the prob that the opposite occurs
    • Probability of at least one of two things that are mutually exclusive is the sum of their respective probabilities
    • If an event A implies the occurrence of event B, then the probability of A occurring is less than the probability that B occurs
    • For any two events the probability that at least one occurs is the sum of their probabilities minus their intersection
  • Random variables
    • numerical outcome of an experiment
    • discrete: count variables (hits on a webpage), we talk about the probability that they take specific values
    • continuous: can take any value in a continuum, we talk about the probability that they fall within some range
  • Probability Mass Function (PMF)
    • evaluated at a certain value, the PMF corresponds to the probability that a random variable takes said value
    • To be valid PMF, a function (p) must satisfy:
      1. It must always be larger than or equal to 0
      2. The sum of the possible values that the random variable can take has to add up to 1
  • Probability Density Function (PDF)
    • PDF is a function associated with a continuous random variable
    • Areas under PDFs correspond to probabilities for that random variable
    • To be a valid PDF, a function (p) must satisfy:
      1. It must be larger than or equal to zero everywhere
      2. The total area under it must be 1
  • Cumulative Distribution Function (CDF)
    • This is a specific area of the PDF, which is very useful
    • The CDF of a random variable, X, returns the probability that the random variable is less than or equal to the value of x.
    • This is for both discrete and continuous random variables
    • F(x) = P(X <= x)
  • Survival Function
    • The SF of a random variable X is defined as the probability that the random variable is greater than the value x
    • S(x) = P(X > x)
  • Quantiles

Conditional probability

  • Given new information, you can change/update the probability of a certain event occurring
  • Conditional on the new information, the probability can change
  • Bayes’ Rule
    • We can reverse the conditioning set provided that we know some marginal probabilities
  • Diagnostic tests
    • sensitivity: probability the test is positive given that ‘the subject actually has the disease’
    • specificity: probability the test is negative given that ‘the subject does not actually have the disease’
    • positive predictive value: probability that the subject has the disease given that the test is positive P(D|+)
    • negative predictive value: probability that the subject does not have the disease given that the test is negative P(Dc|-)
    • prevalence: the marginal probability of disease, P(D)
    • diagnostic likelihood ratio of positive test: sensitivity/(1-specificity)
    • diagnostic likelihood ratio of negative test: (1-sensitivity)/specificity
  • The odds associated with a probability, p, are defined as: p/(1-p)
  • IID random variables
    • random variables are said to be IID if they are independent and identically distributed
      • independent: statistically unrelated from one another
      • identically distributed: all drawn from the same pop. distribution

Expectations

  • Expected values
    • mean: center of distribution
    • variance: how spread out the distribution is
    • standard deviation: how spread out the distribution is, on the same scale as the data
    • sample expected values estimate the population values

Confidence intervals

  • t-intervals
    • assumes data are IID, but not extremely important
    • paired obs. can be analyzed using the t-interval and taking differences
    • t and standard normal become more similar with high DFs (CLT)
  • Sleep data examples
data(sleep); head(sleep)  # extra sleep under two drugs, paired by subject ID
##   extra group ID
## 1   0.7     1  1
## 2  -1.6     1  2
## 3  -0.2     1  3
## 4  -1.2     1  4
## 5  -0.1     1  5
## 6   3.4     1  6
g1 <- sleep$extra[1:10]; g2 <- sleep$extra[11:20]
difference <- g2 - g1  # within-subject differences
mn <- mean(difference); s <- sd(difference); n <- 10

# confidence interval computed by hand from the t quantile
mn + c(-1, 1) * qt(.975, n-1) * s / sqrt(n)
## [1] 0.7001142 2.4598858
t.test(difference)  # one-sample t test on the differences
## 
##  One Sample t-test
## 
## data:  difference
## t = 4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.7001142 2.4598858
## sample estimates:
## mean of x 
##      1.58
t.test(g2, g1, paired = TRUE)  # equivalent paired t test
## 
##  Paired t-test
## 
## data:  g2 and g1
## t = 4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.7001142 2.4598858
## sample estimates:
## mean of the differences 
##                    1.58
t.test(extra ~ I(relevel(group, 2)), paired = TRUE, data = sleep)  # formula interface
## 
##  Paired t-test
## 
## data:  extra by I(relevel(group, 2))
## t = 4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.7001142 2.4598858
## sample estimates:
## mean of the differences 
##                    1.58
  • All give the same results
  • If subjects of two groups cannot be matched, then you cannot pair the two groups (paired = FALSE)
  • Independent group examples
# unpaired, assumed = variance
# x_oc = 132.86 sd = 15.34, n = 8
# x_c  = 127.44 sd = 18.23, n = 21

# pooled standard deviation: BOTH group SDs must be squared.
# (The original wrote 20 * 18.23 instead of 20 * 18.23^2, which
# understates the pooled variance and shrinks the interval.)
sp <- sqrt((7 * 15.34^2 + 20 * 18.23^2) / (8 + 21 - 2))

# interval: difference in means +/- t quantile * sp * sqrt(1/n1 + 1/n2)
132.86 - 127.44 + c(-1, 1) * qt(.975, 27) * sp * (1 / 8 + 1 / 21)^.5
## approximately: -9.52 20.36
  • Generic formulas

t.test(y ~ x, paired = TRUE, var.equal = TRUE, data = df)$conf
t.test(y ~ x, paired = FALSE)$conf

  • When in doubt, assume unequal variance

Confidence intervals (examples)

  • We can calculate a 95% confidence interval using the t.test() function in conjunction with $conf.int
library(datasets); data(mtcars)
round(t.test(mtcars$mpg)$conf.int)  # extract just the 95% CI, rounded
## [1] 18 22
## attr(,"conf.level")
## [1] 0.95
  • The t-interval is: \(\bar X \pm t_{.975,\, n-1} \times s/\sqrt{n}\)
  • This can be used to calculate the average distance between paired samples
    • Ex. The SD of 9 paired differences is 30; what value would the average difference have to be so that the lower endpoint of a 95% Student's t confidence interval touches zero?
# qt(.975, 8) * s / sqrt(n) with s = 30, n = 9: the mean difference at which
# the lower endpoint of the 95% t interval just touches zero
round(qt(.975, df = 8) * 30 / 3, 2)
## [1] 23.06
  • Consider the mtcars dataset. Construct a 95% T interval for MPG comparing 4 to 6 cylinder cars (subtracting in the order of 4 - 6) assume a constant variance.
# compare 4-cylinder vs 6-cylinder mpg with a pooled-variance t interval
m4 <- mtcars$mpg[mtcars$cyl == 4]
m6 <- mtcars$mpg[mtcars$cyl == 6]
# this does 4 - 6 (first argument minus second)
as.vector(t.test(m4, m6, var.equal = TRUE)$conf.int)
## [1]  3.154286 10.687272
  • Suppose that 18 obese subjects were randomized, 9 each, to a new diet pill and a placebo. Subjects body mass indices (BMIs) were measured at a baseline and again after having received the treatment or placebo for four weeks. The average difference from follow-up to the baseline (followup - baseline) was 3 kg/m2 for the treated group and 1 kg/m2 for the placebo group. The corresponding standard deviations of the differences was 1.5 kg/m2 for the treatment group and 1.8 kg/m2 for the placebo group. The study aims to answer whether the change in BMI over the four week period appear to differ between the treated and placebo groups.
# Diet pill vs. placebo BMI example: set up the summary statistics
# and compute the pooled variance under the equal-variance assumption.
n1 <- 9   # treated group size
n2 <- 9   # placebo group size
x1 <- -3  # mean BMI change, treated (followup - baseline, sign flipped)
x2 <- 1   # mean BMI change, placebo
s1 <- 1.5 # SD of the differences, treated
s2 <- 1.8 # SD of the differences, placebo
# pooled variance: weighted average of the two group variances
spsq <- ((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)

Power

  • Power: the probability of rejecting the null hypothesis when in fact the null hypothesis is FALSE
  • Type II error: failing to reject the null hypothesis when it is false
  • Power = 1 - \(\beta\)
  • As alpha gets larger, power gets larger
  • Power of one sided test is greater than power of the associated two sided test
  • The further \(\mu_a\) gets from \(\mu_0\), the larger power gets
  • As n gets larger, power gets larger
  • Power doesn't need \(\mu_a\), \(\sigma\) and n separately, only \(\sqrt{n}(\mu_a - \mu_0)/\sigma\)
  • (MUa - MU0) / sigma = effect size
  • example calculating power, sample size and minimal detectable difference
# Equivalent calculations of power using N, delta, and SD
# (power depends only on the effect size delta/sd, so scaling both
#  delta and sd by the same factor leaves the answer unchanged)
power.t.test(n = 16, delta = 2/4, type = 'one.sample', alt = 'one.sided')$power
## [1] 0.6040329
power.t.test(n = 16, delta = 2, sd = 4, type = 'one.sample', alt = 'one.sided')$power
## [1] 0.6040329
power.t.test(n = 16, delta = 100, sd = 200, type = "one.sample", alt = 'one.sided')$power
## [1] 0.6040329
# Equivalent calculations of sample size using power, delta and SD
power.t.test(power = 0.8, delta = 2/4, sd = 1, type = 'one.sample', alt = 'one.sided')$n
## [1] 26.13751
power.t.test(power = 0.8, delta = 2, sd = 4, type = 'one.sample', alt = 'one.sided')$n
## [1] 26.13751
power.t.test(power = 0.8, delta = 100, sd = 200, type = 'one.sample', alt = 'one.sided')$n
## [1] 26.13751

Multiple testing

  • significance tests are overused
  • correcting for multiple testing avoids false positives
  • Keys
    • error measure
    • correction (m = number of comparisons)
  • Bonferroni correction
    • \(\alpha\) / m
    • highly conservative
  • False discovery rate (FDR, Benjamini-Hochberg)
    • order p-values from smallest to largest
    • check to see if each values is less than alpha * ith/m
    • PRO: easy to calculate, less conservative
    • CON: allows more false positives, behaves strangely under dependence (hypoth tests on similar data)
  • Adjusted p-values
    • not p-values anymore (now have different properties)
    • can calculate error measures without changing alpha
    • multiply p-values by m

Resampled inference

  • The bootstrap
    • good for creating confidence intervals
    • Bootstrap principle: use observed data to construct an estimated population distribution
    • “The most common form of the bootstrap, the non-parametric bootstrap, places probability 1/n on each data point.”
library(UsingR); data(father.son)
## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.1.2
## Loading required package: grid
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
## 
## Loading required package: quantreg
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## 
## Attaching package: 'quantreg'
## 
## The following object is masked from 'package:Hmisc':
## 
##     latex
## 
## The following object is masked from 'package:survival':
## 
##     untangle.specials
## 
## 
## Attaching package: 'UsingR'
## 
## The following object is masked from 'package:survival':
## 
##     cancer
## 
## The following object is masked from 'package:ggplot2':
## 
##     movies
x <- father.son$sheight  # sons' heights
n <- length(x)
B <- 10000  # number of bootstrap resamples
# each row of 'resamples' is one resample of the data, drawn with replacement
resamples <- matrix(sample(x, n * B, replace = TRUE), B, n)
resampledMedians <- apply(resamples, 1, median)  # median of each resample
sd(resampledMedians)  # bootstrap estimate of the median's standard error
## [1] 0.08370475
quantile(resampledMedians, c(0.025, 0.975))  # percentile bootstrap 95% CI
##     2.5%    97.5% 
## 68.43733 68.81461
g <- ggplot(data.frame(resampledMedians = resampledMedians), aes(x = resampledMedians))
g + geom_histogram(color = 'black', fill = 'lightblue', binwidth = 0.05)

  • This is a non-parametric bootstrap
  • Bias corrected and accelerated interval (bootstrap package in R)

Permutation tests

  • Permutation tests permute the predictor variable to obtain a null distribution.
  • This is another type of hypothesis test
# compare insect counts for sprays B and C by permuting the group labels
subdata <- InsectSprays[InsectSprays$spray %in% c("B", "C"), ]
y <- subdata$count
group <- as.character(subdata$spray)
testStat <- function(w, g) mean(w[g == "B"]) - mean(w[g == "C"])
observedStat <- testStat(y, group)
# null distribution: the statistic under 10,000 random label permutations
permutations <- sapply(1:10000, function(i) testStat(y, sample(group)))
observedStat
## [1] 13.25
mean(permutations > observedStat)  # permutation p-value (no permutation exceeded it)
## [1] 0

Regression models

library(UsingR); data(galton)
par(mfrow = c(1, 2))  # side-by-side histograms of child and parent heights
hist(galton$child, col = "blue", breaks = 100)
hist(galton$parent, col = "blue", breaks = 100)

library(manipulate)
# interactive demo: slide mu and watch the mean squared error change;
# the MSE is minimized when mu equals the sample mean
myHist <- function(mu){
  hist(galton$child,col="blue",breaks=100)
  lines(c(mu, mu), c(0, 150),col="red",lwd=5)
  mse <- mean((galton$child - mu)^2)
  text(63, 150, paste("mu = ", mu))
  text(63, 140, paste("MSE = ", round(mse, 2)))
}
manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))
plot(galton$parent,galton$child,col="blue")

# size-coded scatterplot: point area proportional to how many
# (parent, child) pairs share that height combination
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
plot(as.numeric(as.vector(freqData$parent)), 
     as.numeric(as.vector(freqData$child)),
     pch = 21, col = "black", bg = "lightblue",
     cex = .15 * freqData$freq, 
     xlab = "parent", ylab = "child")

# same frequency scatterplot, but with both variables mean-centered;
# slide beta to find the slope through the origin that minimizes the MSE
myPlot <- function(beta){
  y <- galton$child - mean(galton$child)
  x <- galton$parent - mean(galton$parent)
  freqData <- as.data.frame(table(x, y))
  names(freqData) <- c("child", "parent", "freq")
  plot(
    as.numeric(as.vector(freqData$parent)), 
    as.numeric(as.vector(freqData$child)),
    pch = 21, col = "black", bg = "lightblue",
    cex = .15 * freqData$freq, 
    xlab = "parent", 
    ylab = "child"
    )
  abline(0, beta, lwd = 3)  # candidate line through the origin
  points(0, 0, cex = 2, pch = 19)  # mark the origin
  mse <- mean( (y - beta * x)^2 )
  title(paste("beta = ", beta, "mse = ", round(mse, 3)))
}
manipulate(myPlot(beta), beta = slider(0.6, 1.2, step = 0.02))

Notation

  • Greek letters to represent things we don’t know (i.e. things we want to estimate)
  • Empirical mean (central tendency)
    \[ \bar X = \frac{1}{n}\sum_{i=1}^n X_i. \]

  • If we subtract the mean of the data from each data point, we get ‘new’ data with mean 0
  • This is called centering
  • The mean is the least squares solution for minimizing
    \[ \sum_{i=1}^n (X_i - \mu)^2 \]

  • The data defined by \(X_i / s\) (each value divided by the standard deviation \(s\)) have empirical standard deviation (measure of spread) of 1
  • This is called scaling the data

  • normalization: centering and scaling the data (empirical mean = 0, empirical SD = 1)
  • Empirical covariance:

General least squares for linear equations

  • Fitting the best line
    • ‘chuck a spear through it’ in a way that minimizes the squared values (the best ‘mu i hat’)
  • Mean only Regression
    • if we only look at horizontal lines, the least squares estimate of the intercept of that line is the average of the outcomes (the grand mean)
    • if we only look at lines through the origin we get the estimated slope
    • we want both!
  • y = a + bx
    • the line passes through the point of means \((\bar x, \bar y)\)
    • the slope is the correlation of (y,x) times the sd(y)/sd(x)
    • The slope is the same as if you centered the data and did regression through the origin
    • if you normalize the data, the slope is corr(y, x)
  • Lets check…
y <- galton$child
x <- galton$parent
# Closed-form least squares: slope = corr(y, x) * sd(y) / sd(x)
beta1 <- cor(y, x) * sd(y) / sd(x)
# Intercept: the fitted line passes through the point of means
beta0 <- mean(y) - beta1 * mean(x)
# Agrees with lm() to full precision
rbind(c(beta0, beta1), coef(lm(y ~ x)))
##      (Intercept)         x
## [1,]    23.94153 0.6462906
## [2,]    23.94153 0.6462906
  • Regression through the origin yields an equivalent slope if you center the data first
# Centre both variables; regression through the origin on the centred data
# reproduces the slope of the full (intercept + slope) fit
yc <- y - mean(y)
xc <- x - mean(x)
beta1 <- sum(yc * xc) / sum(xc^2)
c(beta1, coef(lm(y ~ x))[2])
##                   x 
## 0.6462906 0.6462906

Regression to the mean

  • Fun Examples
library(UsingR)
data(father.son)
# Normalize son (y) and father (x) heights to mean 0 and sd 1, so the
# regression slope equals the correlation
y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight)
x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight)
rho <- cor(x, y)
# Scatterplot of normalized father (x) versus son (y) heights on a fixed
# [-3, 3] square, drawn without a plot frame.
myPlot <- function(x, y) {
  plot(
    x, y,
    xlab = "Father's height, normalized",
    ylab = "Son's height, normalized",
    xlim = c(-3, 3),
    ylim = c(-3, 3),
    bg = "lightblue",
    col = "black",
    cex = 1.1,
    pch = 21,
    frame = FALSE
  )
}

# Overlay the reference lines that illustrate regression to the mean
myPlot(x, y)
abline(0, 1) # if there were perfect correlation
abline(0, rho, lwd = 2) # father predicts son
abline(0, 1 / rho, lwd = 2) # son predicts father, son on vertical axis
abline(h = 0); abline(v = 0) # reference lines for no relationship

Diamond Examples

# Scatterplot of diamond price (Singapore $) against mass (carats) with
# the least squares fit overlaid
library(UsingR); data (diamond)
with(diamond, plot(carat, price, 
    frame = FALSE,
    xlab = "Mass (carats)", 
    ylab = "Price (SIN $)",
    bg = "lightblue",
    col = "black", cex = 1.1, pch = 21))
abline(lm(price ~ carat, data = diamond), lwd = 2)

Check parameters

fit <- lm(price ~ carat, data = diamond)
# Intercept: expected price of a 0-carat diamond (not meaningful here);
# slope: expected price increase per additional carat
coef(fit)
## (Intercept)       carat 
##   -259.6259   3721.0249

Make the intercept more interpretable by subtracting the mean of the predictor from the predictor (centering).

# Centring the predictor: the intercept becomes the expected price of an
# average-mass diamond; the slope is unchanged
fit2 <- lm(price ~ I(carat - mean(carat)), data = diamond)
coef(fit2)
##            (Intercept) I(carat - mean(carat)) 
##               500.0833              3721.0249

Changing scale to make slope more interpretable: change 1 carat to 1/10th of a carat by dividing the coefficient by 10 (or multiplying the predictor by 10 in the model formula)

# Rescaling the predictor: multiplying carat by 10 makes the slope the
# expected price change per tenth of a carat (intercept unchanged)
fit3 <- lm(price ~ I(carat * 10), data = diamond)
coef(fit3)
##   (Intercept) I(carat * 10) 
##     -259.6259      372.1025

Making predictions of the price of diamonds (select a value for carats, multiply it by the slope, and add the intercept)

# Predict prices at new carat values by hand: intercept + slope * newx
newx <- c(0.16, 0.27, 0.34)
coef(fit)[1] + coef(fit)[2] * newx
## [1]  335.7381  745.0508 1005.5225

This can also be accomplished automatically with the predict() function (note: the new data you want to predict must be in a data frame and the variable must have the same name as the predictor in the model)

# Same predictions via predict(); the newdata column name must match the
# predictor name used when the model was fit
predict(fit, newdata = data.frame(carat = newx))
##         1         2         3 
##  335.7381  745.0508 1005.5225

Residuals

  • The distance between the observed data point and the regression line
  • \(Y_i - \hat Y_i = e_i\) (the \(e_i\) are the residuals)
  • Least squares minimizes the sum of the squared residuals \(\sum e_i^2\)
  • Each \(e_i\) is an estimate of the true error term \(\varepsilon_i\)

Properties of residuals

  • Expected value is 0
  • If intercept is included, sum of residuals is 0
    • this means they cannot be independent
  • Residuals are used for analyzing model fit
  • Residuals are the outcome (y) with the linear association of the predictor (x) removed
    • ‘y adjusted for the linear association of predictor x’
  • We can differentiate residual variation (variation after removing the predictor) from systematic variation (variation explained by the regression model)

Diamond Examples

  • The residuals are the difference between the observed and fitted values
# Residuals equal observed minus fitted values (difference is only
# floating-point noise)
y <- diamond$price; x <- diamond$carat; n <- length(y)
fit <- lm(y ~ x)
e <- resid(fit)
yhat <- predict(fit)
max(abs(e - (y - yhat)))
## [1] 9.485746e-13
  • We can also calculate this directly
# Same check with fitted values computed directly from the coefficients
max(abs(e - (y - coef(fit)[1] - coef(fit)[2] * diamond$carat)))
## [1] 9.485746e-13
  • It is a good idea to plot this
# Residuals vs the predictor; a good fit scatters evenly about zero
plot(x, resid(fit), frame = FALSE)
abline(h = 0)

  • What if it is not a linear fit? How do we know?
# Simulated nonlinear relationship: the residual plot exposes the sin()
# pattern even though the fitted line itself looks plausible
x <- runif(100, -3, 3); y <- x + sin(x) + rnorm(100, sd = .2);
par(mfrow = c(1, 2))
plot(x, y); abline(lm(y ~ x))
plot(x, resid(lm(y ~ x))); abline(h = 0)

  • Heteroskedasticity (variance is not constant at the function of x)
# Simulated heteroskedasticity: the error sd grows with x, which is
# easiest to see in the residual plot
x <- runif(100, 0, 6); y <- x + rnorm(100, mean = 0, sd = 0.001 * x);
par(mfrow = c(1, 2))
plot(x, y); abline(lm(y ~ x))
plot(x, resid(lm(y ~ x))); abline(h = 0)

Estimating residual variation

  • We can get a residual variance estimate from summary() function
# Residual standard deviation (square root of the sum of squared
# residuals divided by n - 2)
summary(fit)$sigma
## [1] 31.84052
  • Summarizing variation:
    • Total variation = residual variation + regression variation
    • The percent of total variation that is explained by the model is R2
      • has to be between 0 and 1
      • is the simple correlation (r) squared
      • can be misleading summary of model fit
      • these plots all have the same R2

Inference in regression

  • We can calculate everything by hand
y <- diamond$price; x <- diamond$carat; n <- length(y)
# Least squares slope and intercept
beta1 <- cor(y, x) * sd(y) / sd(x)
beta0 <- mean(y) - beta1 * mean(x)
# Residuals and residual standard deviation (n - 2 degrees of freedom)
e <- y - beta0 - beta1 * x
sigma <- sqrt(sum(e^2) / (n-2))
# Sum of squares of the centred predictor
ssx <- sum((x - mean(x))^2) 
# Standard errors of the intercept and slope estimates
seBeta0 <- (1/n + mean(x)^2 / ssx)^.5 * sigma
seBeta1 <- sigma / sqrt(ssx)
# t statistics and two-sided p-values under t with n - 2 df
tBeta0 <- beta0/seBeta0; tBeta1 <- beta1 / seBeta1
pBeta0 <- 2 * pt(abs(tBeta0), df = n - 2, lower.tail = FALSE)
pBeta1 <- 2 * pt(abs(tBeta1), df = n - 2, lower.tail = FALSE)
# Assemble a coefficient table matching summary(fit)$coefficients
coefTable <- rbind(c(beta0, seBeta0, tBeta0, pBeta0), c(beta1, seBeta1, tBeta1, pBeta1))
colnames(coefTable) <- c("Estimate", "Std. error", "t value", "p(>|t|)")
rownames(coefTable) <- c("(Intercept)", "x")
print(coefTable)
##              Estimate Std. error   t value      p(>|t|)
## (Intercept) -259.6259   17.31886 -14.99094 2.523271e-19
## x           3721.0249   81.78588  45.49715 6.751260e-40
  • But we probably don’t want to
  • Compare with
# summary() produces exactly the same coefficient table
summary(fit)$coefficients
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) -259.6259   17.31886 -14.99094 2.523271e-19
## x           3721.0249   81.78588  45.49715 6.751260e-40
  • Getting a confidence interval (intercept)
sumCoef <- summary(fit)$coefficients
# 95% CI for the intercept (row 1): estimate +/- t quantile * std. error
sumCoef[1, 1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[1, 2]
## [1] -294.4870 -224.7649
  • Getting a confidence interval (slope)
# 95% CI for the slope (row 2)
sumCoef[2, 1] + c(-1, 1) * qt(.975, df = fit$df) * sumCoef[2, 2]
## [1] 3556.398 3885.651
  • Plotting a prediction interval (the hard way)
plot(x, y, frame = FALSE, xlab = "", ylab = "", 
      pch = 21, col = "black", bg = "lightblue",
      cex = 2)
abline(fit, lwd = 2)
xVals <- seq(min(x), max(x), by = 0.01)
yVals <- beta0 + beta1 * xVals
# se1: standard error of the estimated mean response (confidence band)
se1 <- sigma * sqrt(1 / n + (xVals - mean(x))^2/ssx)
# se2: standard error for a new observation (prediction band; the extra
# "1 +" accounts for the variability of a single new y)
se2 <- sigma * sqrt(1 + 1 / n + (xVals - mean(x))^2/ssx)
# Approximate 95% bands at +/- 2 standard errors
lines(xVals, yVals + 2 * se1)
lines(xVals, yVals - 2 * se1)
lines(xVals, yVals + 2 * se2)
lines(xVals, yVals - 2 * se2)

  • Plotting a prediction interval (in R)
# Same bands via predict(): "confidence" for the mean response,
# "prediction" for a new observation
newdata <- data.frame(x = xVals)
p1 <- predict(fit, newdata, interval = ("confidence"))
p2 <- predict(fit, newdata, interval = ("prediction"))
plot(x, y, frame = FALSE, pch = 21, col = "black", bg = "lightblue", cex = 2)
abline(fit, lwd = 2)
# columns 2 and 3 of the predict() output are the lower/upper bounds
lines(xVals, p1[, 2]); lines(xVals, p1[, 3])
lines(xVals, p2[, 2]); lines(xVals, p2[, 3])

Multivariate regression

  • General linear model extends simple linear regression (SLR) by adding terms linearly into the model
  • Multivariate regression estimates are exactly those having removed the linear relationship of the other variables from both the regressor and response
  • The least squares estimate for the coefficient of a multivariate regression model is exactly regression through the origin with the linear relationships with the other regressors removed from both the regressor and outcome by taking the residuals.
  • Multivariate regression “adjusts” a coefficient for the linear impact of the other variables.

Interpreting coefficients

  • The interpretation of a multivariate regression coefficient is the expected change in the response per unit change in the regressor, holding all of the other regressors fixed

Dummy coding

  • If an intercept is included in the model, R will take the first level of the variable (alphabetically) as the reference
    • t-tests are for comparisons of each level vs. the reference level
    • the empirical mean for the reference level is the intercept
    • other group means are the reference level mean plus the coefficient
  • If the intercept is omitted, R includes terms for all levels of the factor
    • Group means are the coefficients
    • t-tests are for whether the groups are different from zero
  • You can choose the reference level of the factor
    • `relevel(df$factor, "newRefLevel")`

Interactions

  • Simulation example
set.seed(1)

# Simulate n observations with a continuous regressor x and a binary
# treatment indicator z (first half 0, second half 1)
n <- 100; x <- runif(n)
z <- rep(c(0, 1), c(n/2, n/2))

# Intercept  # Slope     # Treatment effect  # Error sd
beta0 <- 0; beta1 <- 2; tau <- 1; sigma <- 0.2

# Generate the response:
# y = beta0 + beta1 * x (slope term) + tau * z (treatment shift) + noise
y <-   beta0 + x         * beta1     + z           * tau       + rnorm(n, sd = sigma)
# Fit with an interaction: y ~ x + z + x:z
mod <- lm(y ~ x * z)

plot(x, y, frame = F)
# Fitted line for the z = 0 group: (intercept, slope)
abline(c(mod$coeff[1], mod$coeff[2]), col = 'black')
# Fitted line for the z = 1 group: add the z and x:z coefficients
abline(c(mod$coeff[1] + mod$coeff[3], 
       mod$coeff[2] + mod$coeff[4]), col = 'black')
# points(x, y, pch = 21, col = ((z == "0")*1+1), bg = 'red')
points(x, y, pch = 21, col = ((z == "1")*1+1), bg = 'blue')
# Reference lines at the means of y and x
abline(a = mean(y), b = 0)
abline(v = mean(x))

Residuals, diagnostics, variation

  • Influential measures: r has many
    • rstandard: standardized residuals (resid / sd)
    • rstudent: standardized residuals (ith data point deleted to follow t-distribution)
    • hatvalues: measures of leverage round(hatvalues(model)[1:10], 3)
    • dffits: change in the predicted response when the ith point is deleted in fitting the model
    • dfbetas: change in individual coefficients when the ith point is deleted in fitting the model round(dfbetas(model)[1:10], 3)
    • cooks.distance: overall change in the coefficients when the ith point is deleted
    • resid: returns ordinary residuals
    • resid(fit) / (1 - hatvalues(fit)): PRESS residuals, the ‘leave out one cross validation residuals’, the difference in the response and the predicted response at data point i, where it was not included in the model fitting

Multiple variables

  • Modeling vs prediction
    • a model is a lens through which we look at data
    • no correct model
    • prediction has separate set of criteria
    • modeling is more concerned with parsimonious, interpretable representations of the data that enhance our understanding of the phenomena under study
  • Known knowns: regressors that we know we should check to include in the model
  • Known unknowns: regressors we would like to include but don't have
  • Unknown unknowns: regressors we don't know about that we should have included in the model
  • Variable inclusion
    • including irrelevant variables increases standard errors of regressor variables
  • Variable exclusion
    • results in bias, unless the omitted variables are uncorrelated with the predictors used
    • this is why we randomize, even so, sometimes there are unobserved confounding variables

Model comparison

  • Variance inflation
require(stats); data(swiss)
fit1 <- lm(Fertility ~ Agriculture, data = swiss)
# Unscaled variance of the Agriculture coefficient in the base model
a <- summary(fit1)$cov.unscaled[2, 2]
fit2 <- update(fit1, Fertility ~ Agriculture + Examination)
fit3 <- update(fit1, Fertility ~ Agriculture + Examination + Education)
# Variance inflation: how much the Agriculture coefficient's variance
# grows as correlated regressors are added
c(summary(fit2)$cov.unscaled[2, 2],
  summary(fit3)$cov.unscaled[2, 2]) / a
## [1] 1.891576 2.089159
# vif(fit1)
# sqrt(vif(fit1))
# Nested model comparison via an analysis of variance table
anova(fit1, fit2, fit3)
## Analysis of Variance Table
## 
## Model 1: Fertility ~ Agriculture
## Model 2: Fertility ~ Agriculture + Examination
## Model 3: Fertility ~ Agriculture + Examination + Education
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     45 6283.1                                  
## 2     44 4072.7  1   2210.38 29.880 2.172e-06 ***
## 3     43 3180.9  1    891.81 12.056  0.001189 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Generalized linear models

  • 3 parts
    • An exponential family model for the response
    • A systematic component via a linear predictor
    • A link function that connects the mean of the response to the linear predictor
  • Interpreting odds ratios
    • not probabilities
    • odds ratio of 1 = no difference in odds
    • log odds ratio of 0 = no difference in odds
    • odds ratio < 0.5 or > 2 commonly a moderate effect

Count outcomes, Poisson GLMs

  • Key ideas

Machine learning

The caret package

  • Useful for prediction algorithms
  • Functionality
    • preprocessing
    • data splitting
    • training/testing functions
    • model comparison
  • There are many types of machine learning algorithms
library(caret); library(kernlab); data(spam)
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:survival':
## 
##     cluster
# 75% of rows (stratified by spam type) go to the training set
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
dim(training)
## [1] 3451   58
set.seed(32343)
# Fit a GLM (logistic regression) predicting type from all 57 predictors
modelFit <- train(type ~ ., data = training, method = "glm")
modelFit
## Generalized Linear Model 
## 
## 3451 samples
##   57 predictor
##    2 classes: 'nonspam', 'spam' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
## 
## Resampling results
## 
##   Accuracy   Kappa      Accuracy SD  Kappa SD  
##   0.9173753  0.8268281  0.01276529   0.02453941
## 
## 
modelFit$finalModel
## 
## Call:  NULL
## 
## Coefficients:
##       (Intercept)               make            address  
##        -1.681e+00         -5.893e-01         -1.349e-01  
##               all              num3d                our  
##         4.432e-02          3.478e+00          4.483e-01  
##              over             remove           internet  
##         1.187e+00          2.056e+00          5.285e-01  
##             order               mail            receive  
##         7.244e-01          8.545e-02         -1.999e-02  
##              will             people             report  
##        -1.067e-01          1.173e-01          5.179e-02  
##         addresses               free           business  
##         7.939e-01          1.102e+00          9.371e-01  
##             email                you             credit  
##         1.846e-01          1.112e-01          8.711e-01  
##              your               font             num000  
##         2.595e-01          1.058e-01          2.597e+00  
##             money                 hp                hpl  
##         3.641e-01         -2.005e+00         -9.171e-01  
##            george             num650                lab  
##        -1.272e+01          3.517e-01         -2.116e+00  
##              labs             telnet             num857  
##        -1.082e+00         -1.238e-01          1.419e+00  
##              data             num415              num85  
##        -5.260e-01          1.513e+00         -1.682e+00  
##        technology            num1999              parts  
##         1.005e+00          1.898e-01         -6.596e-01  
##                pm             direct                 cs  
##        -8.101e-01         -2.300e-01         -5.800e+02  
##           meeting           original            project  
##        -3.177e+00         -2.466e+00         -1.639e+00  
##                re                edu              table  
##        -7.808e-01         -1.829e+00         -1.740e+00  
##        conference      charSemicolon   charRoundbracket  
##        -3.539e+00         -1.178e+00         -1.099e-01  
## charSquarebracket    charExclamation         charDollar  
##        -5.028e-01          2.283e-01          5.702e+00  
##          charHash         capitalAve        capitalLong  
##         3.711e+00          8.134e-03          1.029e-02  
##      capitalTotal  
##         9.172e-04  
## 
## Degrees of Freedom: 3450 Total (i.e. Null);  3393 Residual
## Null Deviance:       4628 
## Residual Deviance: 1321  AIC: 1437
  • The above code uses the caret package and the spam data from the kernlab package
  • It creates a subset of the data (75%) which uses spam type as the dv
  • we then create a training and testing data set
  • and fit a model (GLM) in which spam type (spam or non-spam) is predicted by everything else (57 predictors)
  • modelFit gives an output of what was done (bootstrap sampling with correction) and modelFit$finalModel shows the coefficients
  • Now we will predict on new samples
# Apply the fitted model to the held-out test set
predictions <- predict(modelFit, newdata = testing)
predictions
##    [1] spam    spam    spam    spam    spam    spam    spam    spam   
##    [9] spam    spam    spam    spam    spam    nonspam spam    nonspam
##   [17] spam    spam    spam    spam    spam    nonspam spam    spam   
##   [25] nonspam spam    spam    spam    spam    spam    spam    spam   
##   [33] spam    spam    spam    spam    spam    nonspam spam    spam   
##   [41] spam    spam    spam    nonspam nonspam spam    spam    spam   
##   [49] spam    spam    spam    nonspam spam    nonspam spam    nonspam
##   [57] spam    spam    spam    nonspam spam    spam    spam    spam   
##   [65] spam    spam    spam    spam    spam    spam    spam    spam   
##   [73] spam    nonspam nonspam spam    spam    nonspam spam    nonspam
##   [81] spam    spam    nonspam spam    nonspam spam    spam    spam   
##   [89] spam    spam    spam    spam    spam    spam    spam    spam   
##   [97] spam    spam    spam    spam    nonspam spam    spam    spam   
##  [105] spam    spam    spam    spam    spam    spam    spam    spam   
##  [113] nonspam spam    spam    spam    spam    nonspam spam    spam   
##  [121] spam    spam    spam    spam    spam    nonspam spam    spam   
##  [129] spam    spam    spam    spam    spam    spam    spam    nonspam
##  [137] spam    spam    spam    spam    spam    spam    spam    spam   
##  [145] spam    spam    spam    spam    spam    spam    spam    spam   
##  [153] spam    nonspam spam    spam    spam    spam    spam    nonspam
##  [161] spam    spam    spam    spam    spam    spam    spam    spam   
##  [169] spam    spam    spam    spam    spam    spam    spam    nonspam
##  [177] spam    spam    spam    spam    spam    spam    spam    spam   
##  [185] nonspam spam    spam    spam    spam    spam    spam    nonspam
##  [193] spam    spam    spam    spam    spam    spam    spam    spam   
##  [201] spam    spam    spam    spam    nonspam spam    spam    spam   
##  [209] spam    nonspam spam    spam    spam    spam    spam    nonspam
##  [217] spam    spam    nonspam spam    spam    spam    spam    spam   
##  [225] spam    spam    spam    spam    spam    spam    nonspam nonspam
##  [233] nonspam spam    spam    spam    spam    spam    spam    spam   
##  [241] spam    spam    spam    spam    nonspam spam    spam    spam   
##  [249] spam    spam    spam    spam    spam    nonspam spam    spam   
##  [257] spam    spam    spam    spam    spam    spam    spam    spam   
##  [265] spam    spam    spam    spam    spam    spam    spam    spam   
##  [273] spam    spam    nonspam spam    spam    spam    spam    spam   
##  [281] spam    spam    spam    spam    spam    spam    spam    spam   
##  [289] spam    spam    spam    spam    spam    spam    nonspam spam   
##  [297] spam    spam    spam    spam    spam    spam    spam    spam   
##  [305] spam    spam    nonspam spam    spam    spam    nonspam spam   
##  [313] spam    spam    spam    spam    spam    spam    nonspam spam   
##  [321] spam    spam    spam    spam    spam    spam    spam    spam   
##  [329] spam    spam    spam    spam    spam    spam    spam    spam   
##  [337] spam    spam    spam    spam    spam    spam    nonspam spam   
##  [345] spam    spam    spam    spam    spam    nonspam spam    nonspam
##  [353] spam    spam    spam    spam    spam    spam    spam    nonspam
##  [361] nonspam spam    spam    spam    spam    spam    spam    spam   
##  [369] spam    nonspam spam    spam    spam    spam    spam    spam   
##  [377] nonspam spam    spam    spam    nonspam spam    spam    spam   
##  [385] nonspam spam    spam    spam    nonspam nonspam nonspam spam   
##  [393] spam    spam    spam    nonspam spam    spam    spam    spam   
##  [401] spam    nonspam spam    spam    spam    spam    nonspam spam   
##  [409] spam    nonspam nonspam spam    spam    spam    spam    nonspam
##  [417] spam    spam    spam    spam    spam    nonspam nonspam nonspam
##  [425] spam    spam    spam    nonspam spam    spam    spam    spam   
##  [433] nonspam spam    spam    spam    spam    nonspam spam    spam   
##  [441] spam    spam    spam    spam    spam    spam    spam    spam   
##  [449] spam    spam    spam    spam    spam    nonspam nonspam nonspam
##  [457] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [465] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [473] nonspam nonspam nonspam nonspam nonspam nonspam spam    nonspam
##  [481] spam    nonspam nonspam nonspam nonspam nonspam spam    nonspam
##  [489] nonspam nonspam nonspam nonspam nonspam spam    nonspam nonspam
##  [497] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [505] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [513] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [521] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [529] nonspam nonspam nonspam nonspam nonspam spam    nonspam nonspam
##  [537] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [545] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [553] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [561] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [569] nonspam spam    spam    nonspam nonspam nonspam spam    nonspam
##  [577] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [585] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [593] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [601] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [609] nonspam nonspam nonspam nonspam spam    nonspam nonspam nonspam
##  [617] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [625] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [633] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [641] nonspam nonspam nonspam nonspam nonspam spam    nonspam nonspam
##  [649] nonspam nonspam nonspam nonspam spam    nonspam nonspam nonspam
##  [657] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [665] nonspam nonspam nonspam nonspam nonspam nonspam spam    nonspam
##  [673] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [681] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [689] spam    nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [697] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [705] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [713] nonspam nonspam nonspam spam    nonspam nonspam nonspam nonspam
##  [721] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [729] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [737] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [745] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam   
##  [753] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [761] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [769] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [777] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [785] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [793] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [801] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [809] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [817] nonspam nonspam nonspam nonspam nonspam spam    nonspam nonspam
##  [825] nonspam nonspam spam    nonspam spam    nonspam nonspam nonspam
##  [833] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [841] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [849] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam   
##  [857] spam    nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [865] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [873] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [881] nonspam nonspam nonspam nonspam spam    nonspam nonspam nonspam
##  [889] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [897] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [905] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [913] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [921] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [929] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [937] nonspam nonspam spam    nonspam nonspam nonspam nonspam nonspam
##  [945] nonspam nonspam nonspam nonspam nonspam spam    nonspam nonspam
##  [953] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [961] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [969] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam   
##  [977] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [985] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
##  [993] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1001] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1009] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1017] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1025] nonspam nonspam nonspam spam    nonspam nonspam nonspam nonspam
## [1033] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam   
## [1041] nonspam nonspam nonspam nonspam nonspam nonspam nonspam spam   
## [1049] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1057] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1065] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1073] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1081] nonspam nonspam spam    nonspam nonspam nonspam nonspam spam   
## [1089] nonspam nonspam nonspam nonspam nonspam nonspam spam    nonspam
## [1097] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1105] nonspam nonspam nonspam nonspam nonspam spam    nonspam spam   
## [1113] spam    spam    nonspam nonspam nonspam nonspam nonspam nonspam
## [1121] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1129] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1137] nonspam nonspam nonspam nonspam nonspam nonspam nonspam nonspam
## [1145] nonspam nonspam nonspam nonspam nonspam nonspam
## Levels: nonspam spam
confusionMatrix(predictions, testing$type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     663   65
##    spam         34  388
##                                           
##                Accuracy : 0.9139          
##                  95% CI : (0.8962, 0.9295)
##     No Information Rate : 0.6061          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8175          
##  Mcnemar's Test P-Value : 0.002569        
##                                           
##             Sensitivity : 0.9512          
##             Specificity : 0.8565          
##          Pos Pred Value : 0.9107          
##          Neg Pred Value : 0.9194          
##              Prevalence : 0.6061          
##          Detection Rate : 0.5765          
##    Detection Prevalence : 0.6330          
##       Balanced Accuracy : 0.9039          
##                                           
##        'Positive' Class : nonspam         
## 
  • Now we used the predict command to test new samples
  • We pass the modelFit we fit before and the new data to test on
  • We calculate the confusionMatrix to evaluate the predictive power

Data slicing

  • Used for building testing and training data sets and with cross validation
  • Basically the first thing we did in the above code
library(caret); library(kernlab); data(spam)
# Stratified 75/25 train/test split on spam type
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
dim(training)

K-fold

  • First step in cross-validation
  • Split the data set into smaller sets
set.seed(32323)
# Split the data into 10 folds; returnTrain = TRUE gives the training
# indices for each fold (FALSE would give the held-out test indices)
folds <- createFolds(y = spam$type, k = 10, list = TRUE, returnTrain = TRUE)
sapply(folds, length)
## Fold01 Fold02 Fold03 Fold04 Fold05 Fold06 Fold07 Fold08 Fold09 Fold10 
##   4141   4140   4141   4142   4140   4142   4141   4141   4140   4141
folds[[1]][1:10]
##  [1]  1  2  3  4  5  6  7  8  9 10
  • You can change returnTrain to FALSE in order to see the testing set

Resampling

  • You tell it how many times to resample data (this is with replacement)
set.seed(32323)
# Bootstrap resampling: 10 samples drawn with replacement, each the size
# of the full data set
folds <- createResample(y = spam$type, times = 10, list = TRUE)
sapply(folds, length)
## Resample01 Resample02 Resample03 Resample04 Resample05 Resample06 
##       4601       4601       4601       4601       4601       4601 
## Resample07 Resample08 Resample09 Resample10 
##       4601       4601       4601       4601

Time slices

  • You can check time slices where you take continual time slices in time
set.seed(32323)
tme <- 1:1000
# Sliding windows for time series: train on 20 consecutive points, then
# test on the following 10
folds <- createTimeSlices(y = tme, initialWindow = 20, horizon = 10)
names(folds)
## [1] "train" "test"
folds$train[[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
folds$test[[1]]
##  [1] 21 22 23 24 25 26 27 28 29 30

Training options

  • Begin with the same example as above (the first one)
  • This is the typical case
library(caret); library(kernlab); data(spam)
# Typical setup: stratified split, then train with default options
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)

training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
modelFit <- train(type ~ ., data = training, method = "glm")

Default train options

  • Some other options
    • preProcess
    • weights
    • metric
    • trControl
# Inspect the default arguments of caret's train()
args(train.default)
## function (x, y, method = "rf", preProcess = NULL, ..., weights = NULL, 
##     metric = ifelse(is.factor(y), "Accuracy", "RMSE"), maximize = ifelse(metric == 
##         "RMSE", FALSE, TRUE), trControl = trainControl(), tuneGrid = NULL, 
##     tuneLength = 3) 
## NULL
  • The metric options are built into the train function
  • Continuous outcomes
    • RMSE = root mean squared error
    • RSquared = R2
  • Categorical outcomes
    • Accuracy = Fraction correct
    • Kappa = a measure of concordance

trainControl

args(trainControl)
## function (method = "boot", number = ifelse(grepl("cv", method), 
##     10, 25), repeats = ifelse(grepl("cv", method), 1, number), 
##     p = 0.75, initialWindow = NULL, horizon = 1, fixedWindow = TRUE, 
##     verboseIter = FALSE, returnData = TRUE, returnResamp = "final", 
##     savePredictions = FALSE, classProbs = FALSE, summaryFunction = defaultSummary, 
##     selectionFunction = "best", preProcOptions = list(thresh = 0.95, 
##         ICAcomp = 3, k = 5), index = NULL, indexOut = NULL, timingSamps = 0, 
##     predictionBounds = rep(FALSE, 2), seeds = NA, adaptive = list(min = 5, 
##         alpha = 0.05, method = "gls", complete = TRUE), allowParallel = TRUE) 
## NULL
  • Many metrics specifying training
    • method for resampling
    • number of times to bootstrap/crossvalidate
    • number of times to repeat whole process
    • size of training set
    • initialWindow/horizon
    • summary functions
    • preprocessing options
    • seeds
    • and more
  • Resampling
    • method
      • boot = bootstrapping
      • boot632 = with adjustment
      • cv = cross validation
      • repeatedcv = repeated cross validation
      • LOOCV = leave one out cv
    • number
      • For bootstrapping/cross validation
      • number of subsamples to take
    • repeats
      • number of times to repeat subsampling
      • if big this can slow things down
  • You can set the seed for overall procedure and also for each resample

Plotting predictors

  • It is good to plot after testing/training models
library(ISLR); library(ggplot2); library(caret)
data(Wage)
summary(Wage)
##       year           age               sex                    maritl    
##  Min.   :2003   Min.   :18.00   1. Male  :3000   1. Never Married: 648  
##  1st Qu.:2004   1st Qu.:33.75   2. Female:   0   2. Married      :2074  
##  Median :2006   Median :42.00                    3. Widowed      :  19  
##  Mean   :2006   Mean   :42.41                    4. Divorced     : 204  
##  3rd Qu.:2008   3rd Qu.:51.00                    5. Separated    :  55  
##  Max.   :2009   Max.   :80.00                                           
##                                                                         
##        race                   education                     region    
##  1. White:2480   1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. Black: 293   2. HS Grad        :971   1. New England       :   0  
##  3. Asian: 190   3. Some College   :650   3. East North Central:   0  
##  4. Other:  37   4. College Grad   :685   4. West North Central:   0  
##                  5. Advanced Degree:426   5. South Atlantic    :   0  
##                                           6. East South Central:   0  
##                                           (Other)              :   0  
##            jobclass               health      health_ins      logwage     
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083   Min.   :3.000  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917   1st Qu.:4.447  
##                                                            Median :4.653  
##                                                            Mean   :4.654  
##                                                            3rd Qu.:4.857  
##                                                            Max.   :5.763  
##                                                                           
##       wage       
##  Min.   : 20.09  
##  1st Qu.: 85.38  
##  Median :104.92  
##  Mean   :111.70  
##  3rd Qu.:128.68  
##  Max.   :318.34  
## 
  • Build training and test set
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)

training <- Wage[inTrain, ]
testing <- Wage[-inTrain, ]
dim(training); dim(testing)
## [1] 2102   12
## [1] 898  12
  • Feature plot
featurePlot(x = training[, c("age", "education", "jobclass")], 
    y = training$wage, plot = "pairs")

  • Or with ggplot2
qplot(age, wage, data = training)

qq <- qplot(age, wage, colour = jobclass, data = training)
qq + geom_smooth(method = "lm", formula = y ~ x)

  • It can be useful to cut the dataset up into groups
cutWage <- cut2(training$wage, g = 3)
table(cutWage)
## cutWage
## [ 20.1, 91.7) [ 91.7,118.9) [118.9,318.3] 
##           703           721           678
p1 <- qplot(cutWage, age, data = training, fill = cutWage, geom = c("boxplot"))
p1

  • It is also useful to look at tables of the data
t1 <- table(cutWage, training$jobclass)
t1
##                
## cutWage         1. Industrial 2. Information
##   [ 20.1, 91.7)           437            266
##   [ 91.7,118.9)           373            348
##   [118.9,318.3]           274            404
prop.table(t1, 1)
##                
## cutWage         1. Industrial 2. Information
##   [ 20.1, 91.7)     0.6216216      0.3783784
##   [ 91.7,118.9)     0.5173370      0.4826630
##   [118.9,318.3]     0.4041298      0.5958702
  • density plots are also useful
qplot(wage, colour = education, data = training, geom = "density")

  • Make plots in training set
  • Things to look for
    • imbalance in outcomes/predictors
    • outliers
    • groups of points not explained by a predictor
    • skewed variables

Basic preprocessing

  • After plotting and analyzing variables, you may need to do some preprocessing
  • Why do it?
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

hist(training$capitalAve, main = "", xlab = "ave. capital run length")

  • Data are very skewed, preprocessing is probably a good option
  • For example, the mean of capitalAve is 5.554007 and the standard deviation is 36.1469598. This is highly variable and we might want to preprocess so that we don’t trick the machine learning algorithm
  • Standardizing
    • subtract mean and divide by the standard deviation (mean = 0, sd = 1)
    • In order to standardize the test set we need to subtract the mean of the training set and divide by the sd of the training set… this is important
      • the mean will not be exactly 0 and the sd will not be exactly 1
  • there is a preProcess function
preObj <- preProcess(training[, -58], method = c("center", "scale"))
trainCapAveS <- predict(preObj, training[, -58])$capitalAve
mean(trainCapAveS)
## [1] 4.226973e-18
sd(trainCapAveS)
## [1] 1
  • You can therefore preprocess the training set and use this to preprocess the test set
testCapAveS <- predict(preObj, testing[, -58])$capitalAve
mean(testCapAveS)
## [1] -0.04012184
sd(testCapAveS)
## [1] 0.2844746
  • The preProcess command can also be used as an argument in the train function
  • You can also perform other transforms
preObj <- preProcess(training[, -58], method = c("BoxCox"))
trainCapAveS <- predict(preObj, training[, -58])$capitalAve
par(mfrow = c(1, 2)); hist(trainCapAveS); qqnorm(trainCapAveS)

  • Compare the above transformation to the histogram that was skewed right (2 or 3 above)

Imputing data

  • We can impute if there is a lot of missing data
set.seed(13343)

# make som values NA
training$capAve <- training$capitalAve
selectNA <- rbinom(dim(training)[1], size = 1, prob = 0.05)==1
training$capAve[selectNA] <- NA

# Impute and standardize
preObj <- preProcess(training[, -58], method = "knnImpute")
capAve <- predict(preObj, training[, -58])$capAve

# Standardize true values
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth))/sd(capAveTruth)
  • We can see how well it worked by looking at the quantiles
quantile(capAve - capAveTruth)
##            0%           25%           50%           75%          100% 
## -1.2316629250 -0.0007259377  0.0003077242  0.0008111265  0.1296147093
quantile(capAve - capAveTruth)[selectNA]
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 
##   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA 
## <NA> 
##   NA
quantile((capAve - capAveTruth)[!selectNA])
##            0%           25%           50%           75%          100% 
## -0.7573602868 -0.0006708458  0.0003025593  0.0007748003  0.0011864402
  • Training and test sets should be processed in the same way

Covariate creation

  • covariates are variables that describe the data
    • also called predictor or features
  • 2 steps for creating covariates
    1. from raw data to covariate
      • the average number of capitals in an email, number of dollar signs, etc.
      • depends on application
      • summarization vs. feature loss
      • more knowledge of system = better results
      • better to err on the side of more features
      • process can be automated, but must be careful
    2. transforming tidy covariates
      • ex. square average number of capitals
      • should only be done on the training set
      • best approach is exploratory analysis
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]; testing <- Wage[-inTrain, ]
  • Common covariate to add = dummy variables
dummies <- dummyVars(wage ~ jobclass, data = training)
head(predict(dummies, newdata = training))
##        jobclass.1. Industrial jobclass.2. Information
## 86582                       0                       1
## 11443                       0                       1
## 160191                      1                       0
## 11141                       0                       1
## 229379                      1                       0
## 86064                       1                       0
  • Common to remove non-significant (relevant) covariates
nsv <- nearZeroVar(training, saveMetrics = TRUE)
print(nsv)
##            freqRatio percentUnique zeroVar   nzv
## year        1.037356    0.33301618   FALSE FALSE
## age         1.027027    2.85442436   FALSE FALSE
## sex         0.000000    0.04757374    TRUE  TRUE
## maritl      3.272931    0.23786870   FALSE FALSE
## race        8.938776    0.19029496   FALSE FALSE
## education   1.389002    0.23786870   FALSE FALSE
## region      0.000000    0.04757374    TRUE  TRUE
## jobclass    1.000000    0.09514748   FALSE FALSE
## health      2.468647    0.09514748   FALSE FALSE
## health_ins  2.352472    0.09514748   FALSE FALSE
## logwage     1.061728   19.17221694   FALSE FALSE
## wage        1.061728   19.17221694   FALSE FALSE
  • Splines
library(splines)
bsBasis <- bs(training$age, df = 3)
head(bsBasis)
##              1          2           3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.3625256 0.38669397 0.137491189
## [3,] 0.4422183 0.19539878 0.028779665
## [4,] 0.4430868 0.24369776 0.044677923
## [5,] 0.4308138 0.29109043 0.065560908
## [6,] 0.4261690 0.14823269 0.017186399
lm1 <- lm(wage ~ bsBasis, data = training)
plot(training$age, training$wage, pch = 19, cex = 0.5)
points(training$age, predict(lm1, newdata = training), col = 'red', 
    pch = 19, cex = 0.5)

  • This must also be done on the test set
head(predict(bsBasis, age = testing$age))
##              1          2           3
## [1,] 0.2368501 0.02537679 0.000906314
## [2,] 0.3625256 0.38669397 0.137491189
## [3,] 0.4422183 0.19539878 0.028779665
## [4,] 0.4430868 0.24369776 0.044677923
## [5,] 0.4308138 0.29109043 0.065560908
## [6,] 0.4261690 0.14823269 0.017186399
  • Level 1 feature creation = raw data to covariates
    • you need to get something to measure
    • you can google “feature extraction for X”
  • Level 2 feature creation = covariates to new covariates
    • preProcess function
    • exploratory analysis
    • you can create lots of features, but careful of overfitting (you need to remove the irrelevant ones later)

Preprocessing with principal components analysis

  • Sometimes we have multicollinearity
  • In extreme cases they are practically the same variable
  • We can remove them
  • This shows us the variables that have a correlation of over 0.8
library(caret); library(kernlab); data(spam)
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

M <- abs(cor(training[,-58]))
diag(M) <- 0
which(M > 0.8, arr.ind = TRUE)
##        row col
## num415  34  32
## direct  40  32
## num857  32  34
## direct  40  34
## num857  32  40
## num415  34  40
  • Idea is to find variables that are highly correlated and turn them into one variable (or remove them)
  • This can be done with a weighted combination of predictors
  • The combination should capture the most info possible
  • Benefits
    • reduce number of predictors
    • reduce noise (due to averaging)
  • SVD

  • PCA

smallSpam <- spam[, c(32, 34)]
prComp <- prcomp(smallSpam)
plot(prComp$x[, 1], prComp$x[, 2])

  • we can apply this to the entire dataset
typeColor <- ((spam$type == "spam") * 1 + 1)
prComp <- prcomp(log10(spam[, -58] + 1))
plot(prComp$x[, 1], prComp$x[, 2], col = typeColor, xlab = "PC1", 
    ylab = "PC2")

Predicting with regression

  • Fit a model
  • Plug in new covariates and multiply by the coeff
  • Useful if model is good
  • Easy to do, easy to interpret
  • Not very good if relationship is not linear
  • normally used with other machine learning algorithms
  • We will show an example using geysers
library(caret); data(faithful); set.seed(333)
inTrain <- createDataPartition(y = faithful$waiting, p = 0.5, list = FALSE)
trainFaith <- faithful[inTrain, ]; testFaith <- faithful[-inTrain, ]
head(trainFaith)
##    eruptions waiting
## 6      2.883      55
## 11     1.833      54
## 16     2.167      52
## 19     1.600      52
## 22     1.750      47
## 27     1.967      55
  • If we plot the data we can see that there appears to be a linear relationship
  • Next we fit a model
lm1 <- lm(eruptions ~ waiting, data = trainFaith)
summary(lm1)
## 
## Call:
## lm(formula = eruptions ~ waiting, data = trainFaith)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26990 -0.34789  0.03979  0.36589  1.05020 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.792739   0.227869  -7.867 1.04e-12 ***
## waiting      0.073901   0.003148  23.474  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.495 on 135 degrees of freedom
## Multiple R-squared:  0.8032, Adjusted R-squared:  0.8018 
## F-statistic:   551 on 1 and 135 DF,  p-value: < 2.2e-16
par(mfrow = c(1, 2))
plot(trainFaith$waiting, trainFaith$eruptions)
plot(trainFaith$waiting, trainFaith$eruptions)
lines(trainFaith$waiting, lm1$fitted)

  • Now we can predict using a new wait time
  • this basically takes the intercept plus the slope and multiplies by the new value \[\beta_0 + \beta_1*NEWVAL\]
  • We dont have to actually do this, we can use the predict function
newdata <- data.frame(waiting = 80)
predict(lm1, newdata)
##        1 
## 4.119307
  • It is useful to plot the test data and fit the line derived from the training data to get a dirty, quick idea of how good the fit is
par(mfrow = c(1, 2))
plot(trainFaith$waiting, trainFaith$eruptions)
lines(trainFaith$waiting, predict(lm1))
plot(testFaith$waiting, testFaith$eruptions)
lines(testFaith$waiting, predict(lm1, newdata = testFaith))

  • We can see that they are slightly different
  • Now we will calculate the root mean squared error to get an idea of how well it performed
# RMSE train
sqrt(sum((lm1$fitted - trainFaith$eruptions)^2))
## [1] 5.75186
# RMSE test
sqrt(sum((predict(lm1, newdata = testFaith) - testFaith$eruptions)^2))
## [1] 5.838559
  • We can replot this including prediction intervals
pred1 <- predict(lm1, newdata = testFaith, interval = "prediction")
ord <- order(testFaith$waiting)
plot(testFaith$waiting, testFaith$eruptions)
matlines(testFaith$waiting[ord], pred1[ord, ], type = "l")

  • New example with multiple predictors
  • We will do it using caret this time
library(ISLR)
data(Wage); wage <- subset(Wage, select = -c(logwage))
summary(wage)
##       year           age               sex                    maritl    
##  Min.   :2003   Min.   :18.00   1. Male  :3000   1. Never Married: 648  
##  1st Qu.:2004   1st Qu.:33.75   2. Female:   0   2. Married      :2074  
##  Median :2006   Median :42.00                    3. Widowed      :  19  
##  Mean   :2006   Mean   :42.41                    4. Divorced     : 204  
##  3rd Qu.:2008   3rd Qu.:51.00                    5. Separated    :  55  
##  Max.   :2009   Max.   :80.00                                           
##                                                                         
##        race                   education                     region    
##  1. White:2480   1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. Black: 293   2. HS Grad        :971   1. New England       :   0  
##  3. Asian: 190   3. Some College   :650   3. East North Central:   0  
##  4. Other:  37   4. College Grad   :685   4. West North Central:   0  
##                  5. Advanced Degree:426   5. South Atlantic    :   0  
##                                           6. East South Central:   0  
##                                           (Other)              :   0  
##            jobclass               health      health_ins  
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917  
##                                                           
##                                                           
##                                                           
##                                                           
##                                                           
##       wage       
##  Min.   : 20.09  
##  1st Qu.: 85.38  
##  Median :104.92  
##  Mean   :111.70  
##  3rd Qu.:128.68  
##  Max.   :318.34  
## 
  • we excluded the variable we are going to try to predict
  • now we subset into training and test sets
inTrain <- createDataPartition(y = wage$wage, p = 0.7, list = FALSE)
training <- wage[inTrain, ]; testing <- wage[-inTrain, ]
dim(training); dim(testing)
## [1] 2102   11
## [1] 898  11
  • First we do a feature plot to get an idea of how the variables are related to each other
featurePlot(x = training[, c("age", "education", "jobclass")], y = training$wage, plot = "pairs")

  • Let’s plot and take a look
qplot(age, wage, colour = jobclass, data = training)

qplot(age, wage, colour = education, data = training)

  • It looks like most of the points at the top are blue, meaning they come from the information variable
  • The advanced degree seems to also explain the points at the top
  • Now we fit a model
modFit <- train(wage ~ age + jobclass + education, method = "lm", data = training)
finMod <- modFit$finalModel
print(modFit)
## Linear Regression 
## 
## 2102 samples
##   10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ... 
## 
## Resampling results
## 
##   RMSE      Rsquared  RMSE SD   Rsquared SD
##   36.34569  0.275526  1.281115  0.02146238 
## 
## 
  • Now we want to make some diagnostics plots to check the residuals
plot(finMod, 1, pch = 19, cex = 0.5, col = "#00000010")

qplot(finMod$fitted, finMod$residuals, colour = race, data = training)

plot(finMod$residuals)

pred <- predict(modFit, testing)
qplot(wage, pred, colour = year, data = testing)

Predicting with trees

  • You can use the variables to split the data into groups
  • eval homogeneity within each group
  • split again if necessary
  • easy to interpret, better for nonlinear relationship
  • can lead to overfitting w/o pruning or CV
  • hard to est. uncertainty, results may be variable
  • Method
    • start with all variables
    • find the best variable
    • divide data into two groups (leaves) on that split (node)
    • within each split, find best variable/split that separates the outcomes
    • continue until the groups are too small or sufficiently pure
  • Measure of impurity
    • misclassification error (1 - p)
      • 0 = perfect
      • 0.5 = no purity
    • gini index
      • same as above
    • deviance/information gain
      • 0 = perfect
      • 1 = no purity
  • Example with iris data
data(iris)
table(iris$Species)
## 
##     setosa versicolor  virginica 
##         50         50         50
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]; testing <- iris[-inTrain, ]
dim(training); dim(testing)
## [1] 105   5
## [1] 45  5
qplot(Petal.Width, Sepal.Width, colour = Species, data = training)

  • It looks like the relationship is not linear (probably not good for regression)
modFit <- train(Species ~ ., method = "rpart", data = training)
## Loading required package: rpart
print(modFit$finalModel)
## n= 105 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 35  0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 36  2 versicolor (0.00000000 0.94444444 0.05555556) *
##     7) Petal.Width>=1.75 34  1 virginica (0.00000000 0.02941176 0.97058824) *
plot(modFit$finalModel, uniform = TRUE)
text(modFit$finalModel, use.n = TRUE, all = TRUE)

library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 3.3.0 Copyright (c) 2006-2014 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(modFit$finalModel)

  • We can predict new values using the predict function
predict(modFit, newdata = testing)
##  [1] setosa     setosa     setosa     setosa     setosa     setosa    
##  [7] setosa     setosa     setosa     setosa     setosa     setosa    
## [13] setosa     setosa     setosa     versicolor versicolor versicolor
## [19] versicolor versicolor versicolor versicolor versicolor versicolor
## [25] versicolor versicolor versicolor versicolor versicolor versicolor
## [31] virginica  versicolor virginica  virginica  versicolor virginica 
## [37] virginica  virginica  virginica  virginica  versicolor virginica 
## [43] virginica  virginica  virginica 
## Levels: setosa versicolor virginica

Bootstrap aggregating (Bagging)

  • When you fit complicated model, you can average them together to get a smoother/better fit
  • resample cases and recalculate predictions
  • average predictions or majority vote
  • similar bias to fitting each individual model, but reduced variability
library(ElemStatLearn); data(ozone, package = "ElemStatLearn")
## 
## Attaching package: 'ElemStatLearn'
## 
## The following object is masked _by_ '.GlobalEnv':
## 
##     spam
ozone <- ozone[order(ozone$ozone), ]
head(ozone)
##     ozone radiation temperature wind
## 17      1         8          59  9.7
## 19      4        25          61  9.7
## 14      6        78          57 18.4
## 45      7        48          80 14.3
## 106     7        49          69 10.3
## 7       8        19          61 20.1

Bagged loess

# Bagged loess: fit a loess curve on each of 10 bootstrap resamples of the
# ozone data, then average the predicted curves to get a smoother fit with
# lower variance than any single loess fit.
ll <- matrix(NA, nrow = 10, ncol = 155)
for (i in seq_len(10)) {
    # bootstrap sample: draw row indices with replacement
    ss <- sample(seq_len(nrow(ozone)), replace = TRUE)
    ozone0 <- ozone[ss, ]; ozone0 <- ozone0[order(ozone0$ozone), ]
    loess0 <- loess(temperature ~ ozone, data = ozone0, span = 0.2)
    # predict temperature on a common ozone grid so curves can be averaged
    ll[i, ] <- predict(loess0, newdata = data.frame(ozone = 1:155))
}

# grey lines = individual bootstrap fits; red line = bagged (averaged) fit
plot(ozone$ozone, ozone$temperature, pch = 19, cex = 0.5)
for (i in seq_len(10)) {lines(1:155, ll[i, ], col = "grey", lwd = 2)}
lines(1:155, colMeans(ll), col = "red", lwd = 2)

  • Bagging can also be accomplished in caret (not shown here)

Random forests

  • Similar to bagging
  • bootstrap samples
  • at each split, also bootstrap variables
  • grow lots of trees, vote or average in order to get a prediction for a new outcome
  • Highly accurate
  • slow, often difficult to interpret, can overfit
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]; testing <- iris[-inTrain, ]

modFit <- train(Species ~ ., data = training, method = "rf", prox = TRUE)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:Hmisc':
## 
##     combine
modFit
## Random Forest 
## 
## 105 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 105, 105, 105, 105, 105, 105, ... 
## 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa      Accuracy SD  Kappa SD  
##   2     0.9574476  0.9354405  0.03002109   0.04553321
##   3     0.9531950  0.9290007  0.03262020   0.04953601
##   4     0.9531034  0.9288856  0.03242096   0.04902118
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.
irisP <- classCenter(training[, c(3, 4)], training$Species, modFit$finalModel$prox)
irisP <- as.data.frame(irisP); irisP$Species <- rownames(irisP)
p <- qplot(Petal.Width, Petal.Length, col = Species, data = training)
p + geom_point(aes(x = Petal.Width, y = Petal.Length, col = Species), size = 5, shape = 4, data = irisP)

  • The model is highly accurate
  • We can use it to predict new values
pred <- predict(modFit, testing)
testing$predRight <- pred == testing$Species
table(pred, testing$Species)
##             
## pred         setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         14         1
##   virginica       0          1        14
qplot(Petal.Width, Petal.Length, colour = predRight, data = testing, main = "newdata Prediction")

  • We can see that we missed two, the plot shows which

Boosting

  • Also very accurate
  • Takes large number of predictors, weights them, and adds them together and average to get a stronger predictor
  • Take set of classifiers, create a classifier that combines the classification functions
  • Goal is to minimize error
  • We can boost with any subset of classifiers
  • There are many libraries to do this is R (see videos)
  • Wage example
Wage <- subset(Wage, select = -c(logwage))
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]; testing <- Wage[-inTrain, ]

modFit <- train(wage ~ ., method = "gbm", data = training, verbose = FALSE)
## Loading required package: gbm
## Loading required package: parallel
## Loaded gbm 2.1
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## 
## The following object is masked _by_ '.GlobalEnv':
## 
##     ozone
## 
## The following object is masked from 'package:ElemStatLearn':
## 
##     ozone
## 
## The following objects are masked from 'package:Hmisc':
## 
##     is.discrete, summarize
print(modFit)
## Stochastic Gradient Boosting 
## 
## 2102 samples
##   10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## 
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ... 
## 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE      Rsquared   RMSE SD   Rsquared SD
##   1                   50      34.88119  0.3354209  1.580301  0.02651963 
##   1                  100      34.24918  0.3460333  1.559129  0.02628697 
##   1                  150      34.13728  0.3484395  1.523833  0.02698527 
##   2                   50      34.14742  0.3514951  1.562077  0.02677109 
##   2                  100      34.01654  0.3533109  1.511470  0.02645130 
##   2                  150      34.05669  0.3517570  1.517989  0.02755649 
##   3                   50      34.01561  0.3541818  1.573323  0.02701083 
##   3                  100      34.12517  0.3492055  1.511990  0.02609524 
##   3                  150      34.30884  0.3433877  1.506589  0.02553441 
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth
##  = 3 and shrinkage = 0.1.
qplot(predict(modFit, testing), wage, data = testing)

Model based prediction

  • Assume data follow a probabilistic model
  • Use Bayes theorem to identify optimal classifiers
  • Takes advantage of structure of data
  • accurate on ‘real’ problems
  • made additional assumptions about data
  • if model is incorrect = reduced accuracy

Model based approach

  • Rather complicated (see video if ever interested)
  • Idea is that many different types of classifying models use the same approach here
    • LDA
    • QDA
  • Model based prediction allows for modeling complicated structure
  • Why LDA? Not clear to me, but I don’t really do this type of stuff so I’m not too concerned

Naive Bayes

  • Use Bayes theorem…
    • “Naive Bayes is one of the simplest classification algorithms, but it can also be very accurate. At a high level, Naive Bayes tries to classify instances based on the probabilities of previously seen attributes/instances, assuming complete attribute independence. Oddly enough, this usually works out to give you a good classifier.”
  • Example
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]
testing <- iris[-inTrain, ]
dim(training); dim(testing)
## [1] 105   5
## [1] 45  5
modlda <- train(Species ~ ., data = training, method = "lda")
# modnb  <- train(Species ~ ., data = training, method = "nb")
plda = predict(modlda, testing)
# pnb = predict(modnb, testing)
# table(plda, pnb)

# equalPredictions = (plda == pnb)
# qplot(Petal.Width, Sepal.Width, colour = equalPredictions, data = testing)
  • Naive Bayes doesn’t seem to work

Regularized regression

  • Fit a regression model
  • Penalize the coefficients to eliminate bias
  • This can help with model selection
  • Computationally demanding
  • not as good as random forests or boosting
  • subset data into training, test and validation
  • Very technical… no good examples
  • Large emphasis on the penalization

Combining predictors

  • You can combine classifiers by averaging or voting
  • this improves accuracy
  • reduces interpretability
  • boosting, bagging and random forests are variants on this theme that use the same type of classifier
  • It’s like a majority vote
  • Classifiers can be combined using stacking and ensembling
# Stacking example on the Wage data: fit a GLM and a random forest,
# then combine their predictions with a GAM fit on the test set.
# NOTE: logwage is just a transform of the outcome, so it must be dropped;
# the original assigned the subset to 'wage' and then kept using 'Wage',
# which silently left logwage in every model.
data(Wage); Wage <- subset(Wage, select = -c(logwage))

# split off a validation set, then split the rest into training/testing
inBuild <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
validation <- Wage[-inBuild, ]
buildData <- Wage[inBuild, ]

inTrain <- createDataPartition(y = buildData$wage, p = 0.7, list = FALSE)
training <- buildData[inTrain, ]
testing <- buildData[-inTrain, ]   # fixed typo: was 'tresting', leaving 'testing' undefined below

mod1 <- train(wage ~ ., method = "glm", data = training)
# 'number' is a trainControl() argument (folds for CV), not a train() argument;
# the original passed 'numbers = 3' to train(), where it was silently ignored
mod2 <- train(wage ~ ., method = "rf", data = training,
              trControl = trainControl(method = "cv", number = 3))

pred1 <- predict(mod1, testing)
pred2 <- predict(mod2, testing)

qplot(pred1, pred2, colour = wage, data = testing)

# The two models do not completely agree
# We can try fitting a model that combines predictors

# stack: fit a GAM on the two sets of test-set predictions
predDF <- data.frame(pred1, pred2, wage = testing$wage)
combModFit <- train(wage ~ ., method = "gam", data = predDF)
combPred <- predict(combModFit, predDF)

# check errors (root of summed squared error on the test set)
sqrt(sum((pred1 - testing$wage)^2))
sqrt(sum((pred2 - testing$wage)^2))
sqrt(sum((combPred - testing$wage)^2))

# predict on validation set (held out from all model fitting above)
pred1V <- predict(mod1, validation)
pred2V <- predict(mod2, validation)

# column names must match those used to fit combModFit
predVDF <- data.frame(pred1 = pred1V, pred2 = pred2V)
combPredV <- predict(combModFit, predVDF)

sqrt(sum((pred1V - validation$wage)^2))
sqrt(sum((pred2V - validation$wage)^2))
sqrt(sum((combPredV - validation$wage)^2))
  • We see that in each case the stacked model is more accurate

Forecasting

  • Usually used with time series data
  • data are dependent over time
  • must look for specific pattern types
    • trends
    • seasonal patterns
    • cycles
  • subsampling is more difficult (because of times)
  • same issues in spatial data
  • goal is to predict one or more predictors in the future… all previous methods can be used
library(quantmod)
# pull ~6 years of GOOG daily prices into the workspace as an xts object
from.dat <- as.Date("01/01/08", format = "%m/%d/%y")
to.dat <- as.Date("12/31/13", format = "%m/%d/%y")
# the Google Finance backend was discontinued and src = "google" now fails;
# Yahoo is the working default source in current quantmod
getSymbols("GOOG", src = "yahoo", from = from.dat, to = to.dat)
head(GOOG)

# summarize monthly and store as time series
mGoog <- to.monthly(GOOG)
googOpen <- Op(mGoog)                    # monthly opening prices
ts1 <- ts(googOpen, frequency = 12)      # 12 observations per period (year)
plot(ts1, xlab = "Years +1", ylab = "GOOG")
  • This was a quick and dirty overview.
  • A lot of the code didn’t work

Unsupervised prediction

  • Sometimes you don’t know the labels for prediction
  • You can build clusters, give them names, and build predictors for them
  • There is usually more error here, but it is useful when you know less about the data
  • This is an exploratory technique and you have to be careful when interpreting
# Unsupervised prediction: cluster without labels, then train a
# classifier to predict the cluster assignments.
data(iris)
inTrain <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]
testing <- iris[-inTrain, ]

# k-means on the predictors only (Species withheld to simulate unlabeled data)
kMeans1 <- kmeans(subset(training, select=-c(Species)), centers = 3)
training$clusters <- as.factor(kMeans1$cluster)
qplot(Petal.Width, Petal.Length, colour = clusters, data = training)

# how well do the discovered clusters line up with the true species?
table(kMeans1$cluster, training$Species)
##    
##     setosa versicolor virginica
##   1     17          0         0
##   2      0         33        35
##   3     18          2         0
# fit a tree to predict cluster membership from the predictors
modFit <- train(clusters ~ ., data = subset(training, select=-c(Species)), method = "rpart")
table(predict(modFit, training), training$Species)
##    
##     setosa versicolor virginica
##   1     21          1         0
##   2      0         33        35
##   3     14          1         0
# apply the cluster predictor to new (test) data
testClusterPred <- predict(modFit, testing)
table(testClusterPred, testing$Species)
##                
## testClusterPred setosa versicolor virginica
##               1      7          1         0
##               2      0         14        15
##               3      8          0         0

Developing Data Products

Manipulate

  • Basically shiny, but only works in RStudio
  • Silly example
library(manipulate)
# Interactive demo: histogram of Galton child heights with a movable
# vertical line at mu; displays mu and the mean squared error of the
# heights around mu as the slider moves.
# Fixed: the original mixed up 'mu' and an undefined 'my' — the body
# drew the line with 'my' and the slider was bound to 'my' instead of
# the function's 'mu' argument, so the code errored.
myHist <- function(mu){
    hist(galton$child, col = "blue", breaks = 100)
    lines(c(mu, mu), c(0, 150), col = "red", lwd = 5)
    mse <- mean((galton$child - mu)^2)
    text(63, 150, paste("mu = ", mu))
    text(63, 140, paste("MSE = ", round(mse, 2)))
}
# slider controls mu over the plausible height range (inches)
manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))

rCharts

googleVis

RStudio presenter

  • Similar to IOslides presentations
  • Allows for inline r code and renders as an HTML5 file

R Packages